Merge branch 'hotfix/v0.5.1'

Double precision compile fix
2025-06-15 22:37:05 +01:00 · 2016-07-01 16:33:59 +01:00
257 changed files with 8840 additions and 17693 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,7 @@
 *.o
 *.obj
 # Editor files #
 ################
 *~
@ -47,7 +48,6 @@ Config.h.in
 config.log
 config.status
 .deps
 *.inc
 # http://www.gnu.org/software/autoconf #
 ########################################
@ -63,7 +63,19 @@ config.sub
 config.guess
 INSTALL
 .dirstamp
-ltmain.sh
+
 # Packages #
 ############
 # it's better to unpack these files and commit the raw source
 # git has its own built in compression methods
 *.7z
 *.dmg
 *.gz
 *.iso
 *.jar
 *.rar
 *.tar
 *.zip
 # Logs and databases #
 ######################
@ -89,16 +101,3 @@ build*/*
 #####################
 *.xcodeproj/*
 build.sh
 # Eigen source #
 ################
 lib/Eigen/*
 # FFTW source #
 ################
 lib/fftw/*
 # libtool macros #
 ##################
 m4/lt*
 m4/libtool.m4
--- a/.travis.yml
+++ b/.travis.yml
@ -9,6 +9,10 @@ matrix:
    - os:        osx
      osx_image: xcode7.2
      compiler: clang
    - os:        osx
      osx_image: xcode7.2
      compiler: gcc
      env: VERSION=-5
    - compiler: gcc
      addons:
        apt:
@ -19,8 +23,6 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-4.9
    - compiler: gcc
@ -33,8 +35,6 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-5
    - compiler: clang
@ -47,8 +47,6 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
@ -61,8 +59,6 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
@ -73,7 +69,6 @@ before_install:
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
 install:
@ -87,20 +82,13 @@ install:
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
 script:
-    - ./bootstrap.sh
+    - ./scripts/reconfigure_script
    - mkdir build
    - cd build
-    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
+    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1
    - echo make clean
    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1
-    - echo make clean
+    - make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
+    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
    - make -j4
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
+    - ./benchmarks/Benchmark_dwf --threads 1
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
--- a/Makefile.am
+++ b/Makefile.am
@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-SUBDIRS = lib benchmarks tests
+AM_CXXFLAGS = -I$(top_srcdir)/
 SUBDIRS = lib tests benchmarks
-AM_CXXFLAGS += -I$(top_builddir)/include
+filelist: $(SUBDIRS)
 ACLOCAL_AMFLAGS = -I m4
--- a/README.md
+++ b/README.md
@ -1,28 +1,8 @@
-# Grid
+# Grid [![Build Status](https://travis-ci.org/paboyle/Grid.svg?branch=master)](https://travis-ci.org/paboyle/Grid)
-<table>
+Data parallel C++ mathematical object library
 <tr>
    <td>Last stable release</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
    </td>
 </tr>
 <tr>
    <td>Development branch</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
    </td>
 </tr>
 </table>
-**Data parallel C++ mathematical object library.**
+Last update 2015/7/30
 Please send all pull requests to the `develop` branch.
 License: GPL v2.
 Last update 2016/08/03.
 ### Description
 This library provides data parallel C++ container classes with internal memory layout
 that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 are provided, similar to HPF and cmfortran, and user control is given over the mapping of
@ -42,75 +22,37 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
 for most programmers.
 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported (ARM NEON and BG/Q QPX on the way).
+Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported (ARM NEON on the way).
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
+These are presented as 
-The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
+
     vRealF, vRealD, vComplexF, vComplexD 
 internal vector data types. These may be useful in themselves for other programmers.
 The corresponding scalar types are named
     RealF, RealD, ComplexF, ComplexD
 MPI, OpenMP, and SIMD parallelism are present in the library.
 Please see https://arxiv.org/abs/1512.03487 for more detail.
-### Installation
+   You can give `configure' initial values for configuration parameters
-First, start by cloning the repository:
+by setting variables in the command line or in the environment.  Here
 are examples:
-``` bash
+     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
 git clone https://github.com/paboyle/Grid.git
 ```
-Then enter the cloned directory and set up the build system:
+     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX
-``` bash
+     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
 cd Grid
 ./bootstrap.sh
 ```
-Now you can execute the `configure` script to generate makefiles (here from a build directory):
+     ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
 Note: Before running configure it may be necessary to execute the script
       scripts/filelist
 ``` bash
 mkdir build; cd build
 ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
 ```
-where `--enable-precision=` sets the default precision (`single` or `double`),
+     
-`--enable-simd=` sets the SIMD type (see possible values below), `--enable-
+For developers:
-comms=` sets the protocol used for communications (`none`, `mpi`, `mpi-auto` or
+Use reconfigure_script in the scripts/ directory to create the autotools environment 
 `shmem`), and `<path>` should be replaced by the prefix path where you want to
 install Grid. The `mpi-auto` communication option lets `configure` determine
 automatically how to link to MPI. Other options are available; use `configure
 --help` to display them. As with any other program using the GNU autotools, the
 `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
 customise the build.
 Finally, you can build and install Grid:
 ``` bash
 make; make install
 ```
 To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
 ``` bash
 make -C tests/<subdir> tests
 ```
 ### Possible SIMD types
 The following options can be used with the `--enable-simd=` option to target different SIMD instruction sets:
 | String      | Description                            |
 | ----------- | -------------------------------------- |
 | `GEN`       | generic portable vector code           |
 | `SSE4`      | SSE 4.2 (128 bit)                      |
 | `AVX`       | AVX (256 bit)                          |
 | `AVXFMA4`   | AVX (256 bit) + FMA                    |
 | `AVX2`      | AVX 2 (256 bit)                        |
 | `AVX512`    | AVX 512 bit                            |
 | `AVX512MIC` | AVX 512 bit for Intel MIC architecture |
 | `IMCI`      | Intel IMCI instructions (512 bit)      |
 Alternatively, some CPU codenames can be directly used:
 | String      | Description                            |
 | ----------- | -------------------------------------- |
 | `KNC`       | [Intel Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
 | `KNL`       | [Intel Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 using namespace std;
 using namespace Grid;
@ -194,128 +194,7 @@ int main (int argc, char ** argv)
    }
  }  
 #if 0
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
  for(int lat=4;lat<=32;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){
      std::vector<int> latt_size  ({lat,lat,lat,lat});
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      std::vector<CartesianCommunicator::CommsRequest_t> empty;
      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
      for(int mu=0;mu<4;mu++){
 	ncomm=0;
 	if (mpi_layout[mu]>1 ) {
 	  ncomm++;
 	  int comm_proc;
 	  int xmit_to_rank;
 	  int recv_from_rank;
 	  comm_proc=1;
 	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	  Grid.SendToRecvFromInit(requests_fwd[mu],
 				  (void *)&xbuf[mu][0],
 				  xmit_to_rank,
 				  (void *)&rbuf[mu][0],
 				  recv_from_rank,
 				  bytes);
 	  comm_proc = mpi_layout[mu]-1;
 	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	  Grid.SendToRecvFromInit(requests_bwd[mu],
 				  (void *)&xbuf[mu+4][0],
 				  xmit_to_rank,
 				  (void *)&rbuf[mu+4][0],
 				  recv_from_rank,
 				  bytes);
 	}
      }
      {
 	double start=usecond();
 	for(int i=0;i<Nloop;i++){
 	  for(int mu=0;mu<4;mu++){
 	    if (mpi_layout[mu]>1 ) {
 	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
 	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
 	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
 	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
 	    }
 	  }
 	  Grid.Barrier();
 	}
 	double stop=usecond();
 	double dbytes    = bytes;
 	double xbytes    = Nloop*dbytes*2.0*ncomm;
 	double rbytes    = xbytes;
 	double bidibytes = xbytes+rbytes;
 	double time = stop-start;
 	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
      }
      {
 	double start=usecond();
 	for(int i=0;i<Nloop;i++){
 	  for(int mu=0;mu<4;mu++){
 	    if (mpi_layout[mu]>1 ) {
 	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
 	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
 	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
 	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
 	    }
 	  }
 	  Grid.Barrier();
 	}
 	double stop=usecond();
 	double dbytes    = bytes;
 	double xbytes    = Nloop*dbytes*2.0*ncomm;
 	double rbytes    = xbytes;
 	double bidibytes = xbytes+rbytes;
 	double time = stop-start;
 	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
      }
    }
  }
 #endif
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@ -26,7 +26,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 #include <PerfCount.h>
 using namespace std;
 using namespace Grid;
@ -45,9 +46,9 @@ struct scal {
  };
 bool overlapComms = false;
-typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
-typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
-typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
 int main (int argc, char ** argv)
@ -70,8 +71,8 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
@ -86,6 +87,8 @@ int main (int argc, char ** argv)
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  ColourMatrix cm = Complex(1.0,0.0);
  LatticeGaugeField Umu(UGrid); 
  random(RNG4,Umu);
@ -124,20 +127,21 @@ int main (int argc, char ** argv)
  RealD mass=0.1;
  RealD M5  =1.8;
  typename DomainWallFermionR::ImplParams params; 
  params.overlapCommsCompute = overlapComms;
  RealD NP = UGrid->_Nprocessors;
  for(int doasm=1;doasm<2;doasm++){
    QCD::WilsonKernelsStatic::AsmOpt=doasm;
-  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
-  std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  std::cout << GridLogMessage<< "Calling Dw"<<std::endl;
+  int ncall =10;
  int ncall =100;
  if (1) {
    Dw.ZeroCounters();
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
@ -153,21 +157,19 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result; 
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    assert (norm2(err)< 1.0e-5 );
+    //    Dw.Report();
    Dw.Report();
  }
  if (1)
  {
-    typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
    LatticeFermion ssrc(sFGrid);
    LatticeFermion sref(sFGrid);
    LatticeFermion sresult(sFGrid);
-
+    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
    for(int x=0;x<latt4[0];x++){
    for(int y=0;y<latt4[1];y++){
@ -179,9 +181,8 @@ int main (int argc, char ** argv)
      peekSite(tmp,src,site);
      pokeSite(tmp,ssrc,site);
    }}}}}
-    std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
+
    double t0=usecond();
    sDw.ZeroCounters();
    for(int i=0;i<ncall;i++){
      __SSC_START;
      sDw.Dhop(ssrc,sresult,0);
@ -191,25 +192,25 @@ int main (int argc, char ** argv)
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
-    std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
-    sDw.Report();
+    //  sDw.Report();
    if(0){
      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-  sDw.Dhop(ssrc,sresult,0);
+	sDw.Dhop(ssrc,sresult,0);
-  PerformanceCounter Counter(i);
+	PerformanceCounter Counter(i);
-  Counter.Start();
+	Counter.Start();
-  sDw.Dhop(ssrc,sresult,0);
+	sDw.Dhop(ssrc,sresult,0);
-  Counter.Stop();
+	Counter.Stop();
-  Counter.Report();
+	Counter.Report();
      }
    }
    std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
-    RealD sum=0;
+
    RealF sum=0;
    for(int x=0;x<latt4[0];x++){
    for(int y=0;y<latt4[1];y++){
    for(int z=0;z<latt4[2];z++){
@ -220,19 +221,17 @@ int main (int argc, char ** argv)
      peekSite(normal,result,site);
      peekSite(simd,sresult,site);
      sum=sum+norm2(normal-simd);
-      if (norm2(normal-simd) > 1.0e-6 ) {
+      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
-	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
-	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
+      //      std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
 	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd   "<<simd<<std::endl;
      }
    }}}}}
-    std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
+    std::cout<<" difference between normal and simd is "<<sum<<std::endl;
    assert (sum< 1.0e-5 );
    if (1) {
      LatticeFermion sr_eo(sFGrid);
      LatticeFermion serr(sFGrid);
      LatticeFermion ssrc_e (sFrbGrid);
      LatticeFermion ssrc_o (sFrbGrid);
@ -244,25 +243,23 @@ int main (int argc, char ** argv)
      setCheckerboard(sr_eo,ssrc_o);
      setCheckerboard(sr_eo,ssrc_e);
      serr = sr_eo-ssrc; 
      std::cout<<GridLogMessage << "EO src norm diff   "<< norm2(serr)<<std::endl;
      sr_e = zero;
      sr_o = zero;
      sDw.ZeroCounters();
      sDw.stat.init("DhopEO");
      double t0=usecond();
-      for (int i = 0; i < ncall; i++) {
+      for(int i=0;i<ncall;i++){
-        sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
+	sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
      }
      double t1=usecond();
      sDw.stat.print();
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=(1344.0*volume*ncall)/2;
      std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-      std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
+      std::cout<<GridLogMessage << "sDeo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
      sDw.Report();
      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
      sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
@ -271,18 +268,9 @@ int main (int argc, char ** argv)
      pickCheckerboard(Even,ssrc_e,sresult);
      pickCheckerboard(Odd ,ssrc_o,sresult);
      ssrc_e = ssrc_e - sr_e;
-      RealD error = norm2(ssrc_e);
+      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<<std::endl;
      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
      ssrc_o = ssrc_o - sr_o;
-
+      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<<std::endl;
      error+= norm2(ssrc_o);
      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
      if(error>1.0e-5) { 
 	setCheckerboard(ssrc,ssrc_o);
 	setCheckerboard(ssrc,ssrc_e);
 	std::cout<< ssrc << std::endl;
      }
    }
@ -296,25 +284,24 @@ int main (int argc, char ** argv)
      //    ref =  src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
      tmp = U[mu]*Cshift(src,mu+1,1);
      for(int i=0;i<ref._odata.size();i++){
-  ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
+	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      for(int i=0;i<ref._odata.size();i++){
-  ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
+	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
    }
    ref = -0.5*ref;
  }
  Dw.Dhop(src,result,1);
  std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  assert(norm2(err)<1.0e-5);
+
  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
  LatticeFermion r_e   (FrbGrid);
@ -330,7 +317,6 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
  {
    Dw.ZeroCounters();
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.DhopEO(src_o,r_e,DaggerNo);
@ -341,8 +327,7 @@ int main (int argc, char ** argv)
    double flops=(1344.0*volume*ncall)/2;
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
    Dw.Report();
  }
  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
@ -357,14 +342,11 @@ int main (int argc, char ** argv)
  err = r_eo-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
  assert(norm2(err)<1.0e-5);
  pickCheckerboard(Even,src_e,err);
  pickCheckerboard(Odd,src_o,err);
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
  assert(norm2(src_e)<1.0e-5);
  assert(norm2(src_o)<1.0e-5);
  }
--- a/benchmarks/Benchmark_dwf_ntpf.cc
+++ b/benchmarks/Benchmark_dwf_ntpf.cc
@ -26,7 +26,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 #include <PerfCount.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@ -26,7 +26,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 #include <PerfCount.h>
 using namespace std;
 using namespace Grid;
@ -51,7 +52,7 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
-  const int Ls=8;
+  const int Ls=16;
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
@ -61,8 +62,6 @@ int main (int argc, char ** argv)
    QCD::WilsonKernelsStatic::AsmOpt=0;
  }
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
@ -127,6 +126,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  ColourMatrix cm = Complex(1.0,0.0);
  LatticeGaugeField Umu5d(FGrid); 
  // replicate across fifth dimension
@ -145,10 +145,11 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  }
 #ifdef CHECK
-  if (1) {
+  if (1)
-
+  {
    ref = zero;
    for(int mu=0;mu<Nd;mu++){
      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
@ -192,19 +193,20 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
    Counter.Report();
  }
-  if ( ! report ) {
+  if ( ! report ) 
-    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    {
-    double flops=1344*volume*ncall;
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
+      double flops=1344*volume*ncall;
-  }
+      std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
    }
 #ifdef CHECK
-  err = ref-result; 
+    err = ref-result; 
-  RealD errd = norm2(err);
+    RealD errd = norm2(err);
-  if ( errd> 1.0e-4 ) {
+    if ( errd> 1.0e-4 ) {
-    std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
+      std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
-    exit(-1);
+      exit(-1);
-  }
+    }
 #endif
  LatticeFermion src_e (FrbGrid);
@ -230,9 +232,10 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
      std::cout<< flops/(t1-t0);
    }
  }
 }
-#define CHECK_SDW
+#undef CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {
@ -240,9 +243,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
@ -276,89 +277,93 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
    }
  }
  RealD mass=0.1;
  RealD M5  =1.8;
-  typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
-  LatticeFermion ssrc(sFGrid);
+    LatticeFermion ssrc(sFGrid);
-  LatticeFermion sref(sFGrid);
+    LatticeFermion sref(sFGrid);
-  LatticeFermion sresult(sFGrid);
+    LatticeFermion sresult(sFGrid);
-  WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
+    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
-  for(int x=0;x<latt4[0];x++){
+    for(int x=0;x<latt4[0];x++){
-  for(int y=0;y<latt4[1];y++){
+    for(int y=0;y<latt4[1];y++){
-  for(int z=0;z<latt4[2];z++){
+    for(int z=0;z<latt4[2];z++){
-  for(int t=0;t<latt4[3];t++){
+    for(int t=0;t<latt4[3];t++){
-  for(int s=0;s<Ls;s++){
+    for(int s=0;s<Ls;s++){
-    std::vector<int> site({s,x,y,z,t});
+      std::vector<int> site({s,x,y,z,t});
-    SpinColourVector tmp;
+      SpinColourVector tmp;
-    peekSite(tmp,src,site);
+      peekSite(tmp,src,site);
-    pokeSite(tmp,ssrc,site);
+      pokeSite(tmp,ssrc,site);
-  }}}}}
+    }}}}}
-  double t0=usecond();
+    double t0=usecond();
-  sDw.Dhop(ssrc,sresult,0);
+    sDw.Dhop(ssrc,sresult,0);
-  double t1=usecond();
+    double t1=usecond();
 #ifdef TIMERS_OFF
-  int ncall =10;
+    int ncall =10;
 #else 
-  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+    int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
 #endif
-  PerformanceCounter Counter(8);
+    PerformanceCounter Counter(8);
-  Counter.Start();
+    Counter.Start();
-  t0=usecond();
+    t0=usecond();
-  for(int i=0;i<ncall;i++){
+    for(int i=0;i<ncall;i++){
-    sDw.Dhop(ssrc,sresult,0);
+      sDw.Dhop(ssrc,sresult,0);
-  }
+    }
-  t1=usecond();
+    t1=usecond();
-  Counter.Stop();
+    Counter.Stop();
  if ( report ) {
    Counter.Report();
  } else { 
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
    std::cout<<"\t"<< flops/(t1-t0);
  }
-  LatticeFermion sr_eo(sFGrid);
+    if ( report ) {
-  LatticeFermion serr(sFGrid);
+      Counter.Report();
-  
+    } else { 
-  LatticeFermion ssrc_e (sFrbGrid);
+
-  LatticeFermion ssrc_o (sFrbGrid);
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  LatticeFermion sr_e   (sFrbGrid);
+      double flops=1344*volume*ncall;
-  LatticeFermion sr_o   (sFrbGrid);
+      std::cout<<"\t"<< flops/(t1-t0);
-      
+    }
-  pickCheckerboard(Even,ssrc_e,ssrc);
+
-  pickCheckerboard(Odd,ssrc_o,ssrc);
+
-  
+    LatticeFermion sr_eo(sFGrid);
-  setCheckerboard(sr_eo,ssrc_o);
+    LatticeFermion serr(sFGrid);
  setCheckerboard(sr_eo,ssrc_e);
-  sr_e = zero;
+    LatticeFermion ssrc_e (sFrbGrid);
-  sr_o = zero;
+    LatticeFermion ssrc_o (sFrbGrid);
-  
+    LatticeFermion sr_e   (sFrbGrid);
-  sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+    LatticeFermion sr_o   (sFrbGrid);
-  PerformanceCounter CounterSdw(8);
+      
-  CounterSdw.Start();
+    pickCheckerboard(Even,ssrc_e,ssrc);
-  t0=usecond();
+    pickCheckerboard(Odd,ssrc_o,ssrc);
  for(int i=0;i<ncall;i++){
    __SSC_START;
    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
    __SSC_STOP;
  }
  t1=usecond();
  CounterSdw.Stop();
-  if ( report ) { 
+    setCheckerboard(sr_eo,ssrc_o);
-    CounterSdw.Report();
+    setCheckerboard(sr_eo,ssrc_e);
-  } else {
+    
-    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    sr_e = zero;
-    double flops=(1344.0*volume*ncall)/2;
+    sr_o = zero;
-    std::cout<<"\t"<< flops/(t1-t0);
+    
-  }
+    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
    PerformanceCounter CounterSdw(8);
    CounterSdw.Start();
    t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
      __SSC_STOP;
    }
    t1=usecond();
    CounterSdw.Stop();
    if ( report ) { 
      CounterSdw.Report();
    } else {
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=(1344.0*volume*ncall)/2;
      std::cout<<"\t"<< flops/(t1-t0);
    }
 }
--- a/benchmarks/Benchmark_memory_asynch.cc
+++ b/benchmarks/Benchmark_memory_asynch.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@ -26,7 +26,7 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@ -1,117 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_wilson.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Richard Rollins <rprollins@users.noreply.github.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 // Thin wrapper that stores exactly one value of the parameterised type.
 template<typename ValueT>
 struct scal {
   ValueT internal;   // the wrapped value
 };
 // Table of the four gamma-matrix identifiers, listed in X, Y, Z, T order.
 Gamma::GammaMatrix Gmu [] = {
  Gamma::GammaX,
  Gamma::GammaY,
  Gamma::GammaZ,
  Gamma::GammaT
 };
 bool overlapComms = false;
 void bench_wilson (
 		   LatticeFermion &    src,
 		   LatticeFermion & result,
 		   WilsonFermionR &     Dw,
 		   double const     volume,
 		   int const           dag );
 // Benchmark driver: for a sequence of lattice sizes, builds a Wilson operator
 // and prints the measured Dhop rate with and without the dagger flag.
 //
 // Command line : --asynch sets overlapCommsCompute in the operator parameters.
 // Environment  : LMAX (default 32) is the largest starting extent L,
 //                DMIN (default 0) is the exclusive lower bound of the d loop.
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
   if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; }
   typename WilsonFermionR::ImplParams params;
   params.overlapCommsCompute = overlapComms;
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
   std::vector<int> seeds({1,2,3,4});
   RealD mass = 0.1;
   std::cout<<GridLogMessage << "============================================================================="<< std::endl;
   std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
   std::cout<<GridLogMessage << "============================================================================="<< std::endl;
   std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
   std::cout<<GridLogMessage << "============================================================================="<< std::endl;
   int Lmax = 32;
   int dmin = 0;
   if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
   if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
   for (int L=8; L<=Lmax; L*=2)
     {
       std::vector<int> latt_size = std::vector<int>(4,L);
       for(int d=4; d>dmin; d--)
         {
           // d==4 benchmarks the L^4 lattice unchanged; each later pass doubles
           // one more extent (index 3, then 2, then 1).
           if ( d<=3 ) { latt_size[d] *= 2; }
           std::cout << GridLogMessage;
           // Print the extents as AxBxCxD.
           std::copy( latt_size.begin(), --latt_size.end(), std::ostream_iterator<int>( std::cout, "x" ) );
           std::cout << latt_size.back() << "\t\t";
           GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
           GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
           GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
           LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
           LatticeFermion    src(&Grid); random(pRNG,src);
           LatticeFermion result(&Grid); result=zero;
           // Accumulate the site count in double: an int product overflows once
           // the extents multiply to 2^31 or more (reachable with a large LMAX).
           double volume = std::accumulate(latt_size.begin(),latt_size.end(),1.0,std::multiplies<double>());
           WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
           bench_wilson(src,result,Dw,volume,DaggerNo);
           bench_wilson(src,result,Dw,volume,DaggerYes);
           std::cout << std::endl;
         }
     }
   std::cout<<GridLogMessage << "============================================================================="<< std::endl;
   Grid_finalize();
 }
 // Applies Dw.Dhop(src,result,dag) a fixed number of times, timing the whole
 // loop with usecond(), and prints 1344*volume*calls divided by the elapsed
 // time, followed by two tab characters (no newline).
 void bench_wilson (
                    LatticeFermion &    src,
                    LatticeFermion & result,
                    WilsonFermionR &     Dw,
                    double const     volume,
                    int const           dag )
 {
   int ncall = 1000;

   double begin = usecond();
   int done = 0;
   while ( done < ncall ) {
     Dw.Dhop(src,result,dag);
     ++done;
   }
   double end = usecond();

   double elapsed = end - begin;
   double flops   = 1344 * volume * ncall;
   std::cout << flops/elapsed << "\t\t";
 }
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@ -25,7 +25,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 #include <PerfCount.h>
 using namespace Grid;
@ -40,20 +41,14 @@ int main(int argc,char **argv)
  std::ofstream os("zmm.dat");
  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  for(int L=4;L<=32;L+=4){
    for(int m=1;m<=2;m++){
      for(int Ls=8;Ls<=16;Ls+=8){
 	std::vector<int> grid({L,L,m*L,m*L});
  std::cout << GridLogMessage <<"\t";
 	for(int i=0;i<4;i++) { 
 	  std::cout << grid[i]<<"x";
 	}
-	std::cout << Ls<<"\t\t";
+	std::cout << Ls<<std::endl;
 	bench(os,grid,Ls);
      }
    }
@ -110,6 +105,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  RealD M5  =1.8;
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall=50;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
@ -121,7 +117,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  double flops=1344*volume/2;
  mfc = flops*ncall/(t1-t0);
-  std::cout<<mfc<<"\t\t";
+  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
  QCD::WilsonKernelsStatic::AsmOpt=1;
  t0=usecond();
@ -130,7 +126,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  }
  t1=usecond();
  mfa = flops*ncall/(t1-t0);
-  std::cout<<mfa<<"\t\t";
+  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
  /*
  int dag=DaggerNo;
  t0=usecond();
@ -168,7 +164,8 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  //resulta = (-0.5) * resulta;
  diff = resulto-resulta;
-  std::cout<<norm2(diff)<<std::endl;
+  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
  std::cout<<std::endl;
  return 0;
 }
--- a/benchmarks/Make.inc
+++ b/benchmarks/Make.inc
@ -0,0 +1,39 @@
 bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 Benchmark_comms_SOURCES=Benchmark_comms.cc
 Benchmark_comms_LDADD=-lGrid
 Benchmark_dwf_SOURCES=Benchmark_dwf.cc
 Benchmark_dwf_LDADD=-lGrid
 Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
 Benchmark_dwf_ntpf_LDADD=-lGrid
 Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
 Benchmark_dwf_sweep_LDADD=-lGrid
 Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 Benchmark_memory_asynch_LDADD=-lGrid
 Benchmark_memory_bandwidth_SOURCES=Benchmark_memory_bandwidth.cc
 Benchmark_memory_bandwidth_LDADD=-lGrid
 Benchmark_su3_SOURCES=Benchmark_su3.cc
 Benchmark_su3_LDADD=-lGrid
 Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 Benchmark_wilson_LDADD=-lGrid
 Benchmark_zmm_SOURCES=Benchmark_zmm.cc
 Benchmark_zmm_LDADD=-lGrid
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@ -1 +1,8 @@
 # additional include paths necessary to compile the C++ library
 AM_CXXFLAGS = -I$(top_srcdir)/lib
 AM_LDFLAGS = -L$(top_builddir)/lib
 #
 # Test code
 #
 include Make.inc
--- a/bootstrap.sh
+++ b/bootstrap.sh
@ -1,19 +0,0 @@
 #!/usr/bin/env bash
 # Downloads the Eigen and FFTW archives, hands each to its update script,
 # removes the downloaded archive, then regenerates the Make.inc files and
 # the configure script.
 EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
 FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
 # Archive names: the last path component of each URL.
 EIGEN_ARCHIVE=$(basename "${EIGEN_URL}")
 FFTW_ARCHIVE=$(basename "${FFTW_URL}")
 echo "-- deploying Eigen source..."
 wget "${EIGEN_URL}" --no-check-certificate
 ./scripts/update_eigen.sh "${EIGEN_ARCHIVE}"
 rm "${EIGEN_ARCHIVE}"
 echo "-- copying fftw prototypes..."
 wget "${FFTW_URL}"
 ./scripts/update_fftw.sh "${FFTW_ARCHIVE}"
 rm "${FFTW_ARCHIVE}"
 echo '-- generating Make.inc files...'
 ./scripts/filelist
 echo '-- generating configure script...'
 autoreconf -fvi
--- a/configure.ac
+++ b/configure.ac
@ -1,366 +1,315 @@
 #                         -*- Autoconf -*-
 # Process this file with autoconf to produce a configure script.
 #
 # Project Grid package  
 # 
 # Time-stamp: <2015-07-10 17:46:21 neo>
 AC_PREREQ([2.63])
-AC_INIT([Grid], [0.5.1-dev], [https://github.com/paboyle/Grid], [Grid])
+AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
-AC_CANONICAL_BUILD
+AC_CANONICAL_SYSTEM
 AC_CANONICAL_HOST
 AC_CANONICAL_TARGET
 AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([lib/Grid.h])
 AC_CONFIG_HEADERS([lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 AC_MSG_NOTICE([
-############### Checks for programs
+:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 Configuring $PACKAGE v$VERSION  for $host
 :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 ])
 # Checks for programs.
 AC_LANG(C++)
 CXXFLAGS="-O3 $CXXFLAGS"
 AC_PROG_CXX
 AC_PROG_RANLIB
 ############ openmp  ###############
 AC_OPENMP
 AC_PROG_RANLIB
 #AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
 AX_EXT
-ac_openmp=no
+# Checks for libraries.
 #AX_GCC_VAR_ATTRIBUTE(aligned)
-if test "${OPENMP_CXXFLAGS}X" != "X"; then
+# Checks for header files.
 ac_openmp=yes
 AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
 AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
 fi
 ############### Checks for header files
 AC_CHECK_HEADERS(stdint.h)
 AC_CHECK_HEADERS(mm_malloc.h)
 AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
 AC_CHECK_HEADERS(gmp.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
-############### Checks for typedefs, structures, and compiler characteristics
+# Checks for typedefs, structures, and compiler characteristics.
 AC_TYPE_SIZE_T
 AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T
-############### GMP and MPFR #################
+# Checks for library functions.
-AC_ARG_WITH([gmp],
+echo
-    [AS_HELP_STRING([--with-gmp=prefix],
+echo Checking libraries 
-    [try this for a non-standard install prefix of the GMP library])],
+echo :::::::::::::::::::::::::::::::::::::::::::
    [AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
    [AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
 AC_ARG_WITH([mpfr],
    [AS_HELP_STRING([--with-mpfr=prefix],
    [try this for a non-standard install prefix of the MPFR library])],
    [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
    [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
 ################## lapack ####################
 AC_ARG_ENABLE([lapack],
    [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], 
    [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
 case ${ac_LAPACK} in
    no)
        ;;
    yes)
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
    *)
        AM_CXXFLAGS="-I$ac_LAPACK/include $AM_CXXFLAGS"
        AM_LDFLAGS="-L$ac_LAPACK/lib $AM_LDFLAGS"
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
 esac
 ################## first-touch ####################
 # The value given to --enable-numa arrives in the lower-case shell variable
 # enable_numa. Shell variable names are case-sensitive, so reading
 # enable_NUMA always produced an empty string and the option was ignored.
 AC_ARG_ENABLE([numa],
     [AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])], 
     [ac_NUMA=${enable_numa}],[ac_NUMA=no])
 case ${ac_NUMA} in
     no)
         ;;
     yes)
         AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
     *)
         # Any other value (e.g. a prefix) also switches the feature on.
         AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
 esac
 ################## FFTW3 ####################
 AC_ARG_WITH([fftw],    
            [AS_HELP_STRING([--with-fftw=prefix],
            [try this for a non-standard install prefix of the FFTW3 library])],
            [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
            [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
 ################ Get compiler information
 AC_LANG([C++])
 AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
 AX_COMPILER_VENDOR
 AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
      [vendor of C++ compiler that will compile the code])
 AX_GXX_VERSION
 AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
      [version of g++ that will compile the code])
 ############### Checks for library functions
 CXXFLAGS_CPY=$CXXFLAGS
 LDFLAGS_CPY=$LDFLAGS
 CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
 LDFLAGS="$AM_LDFLAGS $LDFLAGS"
 AC_CHECK_FUNCS([gettimeofday])
 AC_CHECK_LIB([gmp],[__gmpf_init],
             [AC_CHECK_LIB([mpfr],[mpfr_init],
                 [AC_DEFINE([HAVE_LIBMPFR], [1], [Define to 1 if you have the `MPFR' library (-lmpfr).])]
                 [have_mpfr=true]
                 [LIBS="$LIBS -lmpfr"],
                 [AC_MSG_ERROR([MPFR library not found])])]
   	     [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
             [have_gmp=true]
             [LIBS="$LIBS -lgmp"],
             [AC_MSG_WARN([**** GMP library not found, Grid can still compile but RHMC will not work ****])])
-if test "${ac_LAPACK}x" != "nox"; then
+#AC_CHECK_LIB([gmp],[__gmpf_init],,
-    AC_CHECK_LIB([lapack],[LAPACKE_sbdsdc],[],
+#        [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
-                 [AC_MSG_ERROR("LAPACK enabled but library not found")])
+#Please install or provide the correct path to your installation
-fi
+#Info at: http://www.gmplib.org)])
 AC_CHECK_LIB([fftw3],[fftw_execute],
  [AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])]
  [have_fftw=true]
  [LIBS="$LIBS -lfftw3 -lfftw3f"],
  [AC_MSG_WARN([**** FFTW library not found, Grid can still compile but FFT-based routines will not work ****])])
 CXXFLAGS=$CXXFLAGS_CPY
 LDFLAGS=$LDFLAGS_CPY
-############### SIMD instruction selection
+#AC_CHECK_LIB([mpfr],[mpfr_init],,
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVXFMA|AVX2|AVX512|AVX512MIC|IMCI|KNL|KNC],\
+#        [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
 #Please install or provide the correct path to your installation
 #Info at: http://www.mpfr.org/)])
 #
 # SIMD instructions selection
 #
 AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
 	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
-	[ac_SIMD=${enable_simd}],[ac_SIMD=GEN])
+	[ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
-case ${ax_cv_cxx_compiler_vendor} in
+supported=no
-  clang|gnu)
+
-    case ${ac_SIMD} in
+ac_ZMM=no;
      SSE4)
        AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
        SIMD_FLAGS='-msse4.2';;
      AVX)
        AC_DEFINE([AVX1],[1],[AVX intrinsics])
        SIMD_FLAGS='-mavx';;
      AVXFMA4)
        AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
        SIMD_FLAGS='-mavx -mfma4';;
      AVXFMA)
        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
        SIMD_FLAGS='-mavx -mfma';;
      AVX2)
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
        SIMD_FLAGS='-mavx2 -mfma';;
      AVX512|AVX512MIC|KNL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
      IMCI|KNC)
        AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
        SIMD_FLAGS='';;
      GEN)
        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        SIMD_FLAGS='';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
    esac;;
  intel)
    case ${ac_SIMD} in
      SSE4)
        AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
        SIMD_FLAGS='-msse4.2 -xsse4.2';;
      AVX)
        AC_DEFINE([AVX1],[1],[AVX intrinsics])
        SIMD_FLAGS='-mavx -xavx';;
      AVXFMA4)
        AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
        SIMD_FLAGS='-mavx -mfma';;
      AVXFMA)
        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
        SIMD_FLAGS='-mavx -mfma';;
      AVX2)
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
      AVX512)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-xcore-avx512';;
      AVX512MIC|KNL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
        SIMD_FLAGS='-xmic-avx512';;
      IMCI|KNC)
        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
        SIMD_FLAGS='';;
      GEN)
        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
    esac;;
  *)
    AC_MSG_WARN([Compiler unknown, using generic vector code])
    AC_DEFINE([GENERIC_VEC],[1],[generic vector code]);;
 esac
 AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
 AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"
 case ${ac_SIMD} in
-  AVX512|AVX512MIC|KNL)
+     SSE4)
-    AC_DEFINE([TEST_ZMM],[1],[compile ZMM test]);;
+       echo Configuring for SSE4
-  *)
+       AC_DEFINE([SSE4],[1],[SSE4 Intrinsics] )
-	;;
+       if test x"$ax_cv_support_ssse3_ext" = x"yes"; then  dnl minimal support for SSE4
         supported=yes
       else
  	AC_MSG_WARN([Your processor does not support SSE4 instructions])
       fi
     ;;
     AVX)
       echo Configuring for AVX
       AC_DEFINE([AVX1],[1],[AVX Intrinsics] )
       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
       supported=yes			  
       else
       	AC_MSG_WARN([Your processor does not support AVX instructions])
       fi
     ;;
     AVXFMA4)
       echo Configuring for AVX
       AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
       supported=yes			  
       else
       	AC_MSG_WARN([Your processor does not support AVX instructions])
       fi
     ;;
     AVX2)
       echo Configuring for AVX2
       AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
       if test x"$ax_cv_support_avx2_ext" = x"yes"; then  dnl minimal support for AVX2
       supported=yes
       else
       AC_MSG_WARN([Your processor does not support AVX2 instructions])
       fi
     ;;
     AVX512)
       echo Configuring for AVX512 
       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
       supported="cross compilation"
       ac_ZMM=yes;
     ;;
     IMCI)
       echo Configuring for IMCI
       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
       supported="cross compilation"
       ac_ZMM=no;
     ;;
     NEONv8)
       echo Configuring for experimental ARMv8a support 
       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
       supported="cross compilation"
     ;;
     DEBUG)
       echo Configuring without SIMD support - only for compiler DEBUGGING!
       AC_DEFINE([EMPTY_SIMD],[1],[EMPTY_SIMD only for DEBUGGING] )
      ;;     
     *)
     AC_MSG_ERROR([${ac_SIMD} flag unsupported as --enable-simd option\nRun ./configure --help for the list of options]); 
     ;;
 esac
-############### precision selection
+case ${ac_ZMM} in
 yes)
 	echo Enabling ZMM source code
 ;;
 no)
 	echo Disabling ZMM source code
 ;;
 esac
 # String equality uses the portable = operator; == is not accepted by every /bin/sh test builtin.
 AM_CONDITIONAL(BUILD_ZMM,[ test "X${ac_ZMM}X" = "XyesX" ])
 AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
 case ${ac_PRECISION} in
     single)
       echo default precision is single
       AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
     ;;
     double)
       echo default precision is double
       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
     ;;
 esac
-############### communication type selection
+#
-AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|shmem],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
+# Comms selection
 #
 AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
 case ${ac_COMMS} in
     none)
       echo Configuring for NO communications
       AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
     ;;
     mpi-auto)
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
       LX_FIND_MPI
       if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
       AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
       AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
       AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
       LIBS="`echo $MPI_CXXLDFLAGS | sed -E 's/-L@<:@^ @:>@+//g'` $LIBS"
     ;;
     mpi)
       echo Configuring for MPI communications
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
     ;;
     mpi3)
       AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
     ;;
     shmem)
       echo Configuring for SHMEM communications
       AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
     ;;
     *)
     AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); 
     ;;
 esac
 # One conditional per communications back end, keyed on the selected ac_COMMS value.
 # String equality uses the portable = operator; == is not accepted by every /bin/sh test builtin.
 AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" = "XshmemX" ])
-AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" = "XmpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" = "Xmpi3X"] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" = "XnoneX" ])
-############### RNG selection
+#
 # RNG selection
 #
 AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\
 	[Select Random Number Generator to be used])],\
 	[ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
 case ${ac_RNG} in
     ranlux48)
-      AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
+     AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
     ;;
     mt19937)
-      AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
+     AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
     ;;
     *)
-      AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
+     AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
     ;;
 esac
-############### timer option
+#
-AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers],\
+# SDE timing mode
 #
 AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
 	[Enable system dependent high res timers])],\
 	[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
 case ${ac_TIMERS} in
     yes)
-      AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
+     AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
     ;;
     no)
-      AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
+     AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
     ;;
     *)
-      AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]); 
+     AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]); 
     ;;
 esac
-############### Chroma regression test
+#
 # Chroma regression tests
 #
 AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
 case ${ac_CHROMA} in
-     yes|no)
+     yes)
       echo Enabling tests regressing to Chroma
     ;;
     no)
       echo Disabling tests regressing to Chroma
     ;;
     *)
-       AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); 
+     AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); 
     ;;
 esac
 # String equality uses the portable = operator; == is not accepted by every /bin/sh test builtin.
 AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" = "XyesX" ])
-############### Doxygen
+#
-AC_PROG_DOXYGEN
+# Lapack
 #
 AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
-if test -n "$DOXYGEN"
+case ${ac_LAPACK} in
-then
+     yes)
-AC_CONFIG_FILES([docs/doxy.cfg])
+       echo Enabling lapack
-fi
+     ;;
     no)
       echo Disabling lapack
     ;;
     *)
       echo Enabling lapack at ${ac_LAPACK}
     ;;
 esac
-############### Ouput
+AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
-cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
+AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
-AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
+
-AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
+###################################################################
-AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
+# Checks for doxygen support
-AC_SUBST([AM_CFLAGS])
+# if present enables the "make doxyfile" command
-AC_SUBST([AM_CXXFLAGS])
+#echo
-AC_SUBST([AM_LDFLAGS])
+#echo Checking doxygen support 
 #echo :::::::::::::::::::::::::::::::::::::::::::
 #AC_PROG_DOXYGEN
 #if test -n "$DOXYGEN"
 #then
 #AC_CONFIG_FILES([docs/doxy.cfg])
 #fi
 echo
 echo Creating configuration files
 echo :::::::::::::::::::::::::::::::::::::::::::
 AC_CONFIG_FILES(Makefile)
 AC_CONFIG_FILES(lib/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
 AC_CONFIG_FILES(tests/IO/Makefile)
 AC_CONFIG_FILES(tests/core/Makefile)
 AC_CONFIG_FILES(tests/debug/Makefile)
 AC_CONFIG_FILES(tests/forces/Makefile)
 AC_CONFIG_FILES(tests/hmc/Makefile)
 AC_CONFIG_FILES(tests/solver/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_OUTPUT
 echo "
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Summary of configuration for $PACKAGE v$VERSION
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
----- PLATFORM ----------------------------------------
+The following features are enabled:
 - architecture (build)          : $build_cpu
 - os (build)                    : $build_os
 - architecture (target)         : $target_cpu
 - os (target)                   : $target_os
 - compiler vendor               : ${ax_cv_cxx_compiler_vendor}
 - compiler version              : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
 - SIMD                          : ${ac_SIMD}
 - Threading                     : ${ac_openmp} 
 - Communications type           : ${ac_COMMS}
 - Default precision             : ${ac_PRECISION}
 - RNG choice                    : ${ac_RNG} 
 - GMP                           : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 - LAPACK                        : ${ac_LAPACK}
 - FFTW                          : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
 - build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
 - graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
----- BUILD FLAGS -------------------------------------
+- Supported SIMD flags          : $SIMD_FLAGS
- CXXFLAGS:
+----------------------------------------------------------
-`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/    -/g'`
+- enabled simd support          : ${ac_SIMD}   (config macro says supported: $supported )
- LDFLAGS:
+- communications type           : ${ac_COMMS}
-`echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/    -/g'`
+- default precision             : ${ac_PRECISION}
- LIBS:
+- RNG choice                    : ${ac_RNG} 
-`echo ${LIBS} | tr ' ' '\n' | sed 's/^-/    -/g'`
+- LAPACK	                : ${ac_LAPACK} 
-------------------------------------------------------
+
 "
--- a/include/Grid
+++ b/include/Grid
@ -1 +0,0 @@
 ../lib
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@ -29,28 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALGORITHMS_H
 #define GRID_ALGORITHMS_H
-#include <Grid/algorithms/SparseMatrix.h>
+#include <algorithms/SparseMatrix.h>
-#include <Grid/algorithms/LinearOperator.h>
+#include <algorithms/LinearOperator.h>
-#include <Grid/algorithms/Preconditioner.h>
+#include <algorithms/Preconditioner.h>
-#include <Grid/algorithms/approx/Zolotarev.h>
+#include <algorithms/approx/Zolotarev.h>
-#include <Grid/algorithms/approx/Chebyshev.h>
+#include <algorithms/approx/Chebyshev.h>
-#include <Grid/algorithms/approx/Remez.h>
+#include <algorithms/approx/Remez.h>
-#include <Grid/algorithms/approx/MultiShiftFunction.h>
+#include <algorithms/approx/MultiShiftFunction.h>
-#include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <algorithms/iterative/ConjugateGradient.h>
-#include <Grid/algorithms/iterative/ConjugateResidual.h>
+#include <algorithms/iterative/ConjugateResidual.h>
-#include <Grid/algorithms/iterative/NormalEquations.h>
+#include <algorithms/iterative/NormalEquations.h>
-#include <Grid/algorithms/iterative/SchurRedBlack.h>
+#include <algorithms/iterative/SchurRedBlack.h>
-#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
+#include <algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 // Lanczos support
-#include <Grid/algorithms/iterative/MatrixUtils.h>
+#include <algorithms/iterative/MatrixUtils.h>
-#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
-#include <Grid/algorithms/CoarsenedMatrix.h>
+#include <algorithms/CoarsenedMatrix.h>
 // Eigen/lanczos
 // EigCg
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@ -40,6 +40,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <mm_malloc.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
 extern "C" { 
 #include <mpp/shmem.h>
 extern void * shmem_align(size_t, size_t);
 extern void  shmem_free(void *);
 }
 #endif
 namespace Grid {
 ////////////////////////////////////////////////////////////////////
@ -57,85 +65,28 @@ public:
  typedef _Tp        value_type;
  template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };
  alignedAllocator() throw() { }
  alignedAllocator(const alignedAllocator&) throw() { }
  template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
  ~alignedAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  //  const_pointer address(const_reference __x) const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
  // Allocate raw storage for __n objects of type _Tp, aligned to 128 bytes.
  // _mm_malloc is used when HAVE_MM_MALLOC_H is defined, memalign otherwise.
  // The hint argument _p is not used. The pointer from the underlying
  // allocator is returned as-is, without a null check.
  // NOTE(review): __n*sizeof(_Tp) is not checked for overflow - confirm callers stay below max_size().
  pointer allocate(size_type __n, const void* _p= 0)
  { 
 #ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
    // Default-initialised value; it is only read by the GRID_NUMA loop below.
    _Tp tmp;
 #ifdef GRID_NUMA
 // Each element is assigned once from an OpenMP loop with a static schedule;
 // presumably a first-touch pass for page placement - TODO confirm.
 // NOTE(review): the index is int while __n is size_type.
 #pragma omp parallel for schedule(static)
  for(int i=0;i<__n;i++){
    ptr[i]=tmp;
  }
 #endif 
    return ptr;
  }
  // Release storage obtained from allocate(); the element count is ignored.
  // The release call mirrors the allocation branch: _mm_free pairs with
  // _mm_malloc (HAVE_MM_MALLOC_H), free pairs with memalign.
  void deallocate(pointer __p, size_type) { 
 #ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p); 
 #else
    free((void *)__p);
 #endif
  }
  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 // Any two alignedAllocator instances of the same value type compare equal.
 template<typename _Tp>
 inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&)
 {
   return true;
 }
 // Counterpart of operator== above: never reports a difference.
 template<typename _Tp>
 inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&)
 {
   return false;
 }
 //////////////////////////////////////////////////////////////////////////////////////////
 // MPI3 : comms must use shm region
 // SHMEM: comms must use symmetric heap
 //////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_COMMS_SHMEM
 extern "C" { 
 #include <mpp/shmem.h>
 extern void * shmem_align(size_t, size_t);
 extern void  shmem_free(void *);
 }
 #define PARANOID_SYMMETRIC_HEAP
 #endif
 template<typename _Tp>
 class commAllocator {
 public: 
  typedef std::size_t     size_type;
  typedef std::ptrdiff_t  difference_type;
  typedef _Tp*       pointer;
  typedef const _Tp* const_pointer;
  typedef _Tp&       reference;
  typedef const _Tp& const_reference;
  typedef _Tp        value_type;
  template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; };
  commAllocator() throw() { }
  commAllocator(const commAllocator&) throw() { }
  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
  ~commAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 #ifdef GRID_COMMS_SHMEM
  pointer allocate(size_type __n, const void* _p= 0)
  {
 #ifdef CRAY
    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
-#else
+
-    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
+
-#endif
+#define PARANOID_SYMMETRIC_HEAP
 #ifdef PARANOID_SYMMETRIC_HEAP
    static void * bcast;
    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
@ -145,47 +96,55 @@ public:
    if ( bcast != ptr ) {
      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
-      //      BACKTRACEFILE();
+      BACKTRACEFILE();
      exit(0);
    }
    assert( bcast == (void *) ptr);
 #endif 
    return ptr;
  }
  void deallocate(pointer __p, size_type) { 
    shmem_free((void *)__p);
  }
 #else
-  pointer allocate(size_type __n, const void* _p= 0) 
+
  {
 #ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
 #endif
    _Tp tmp;
 #undef FIRST_TOUCH_OPTIMISE
 #ifdef FIRST_TOUCH_OPTIMISE
 #pragma omp parallel for 
  for(int i=0;i<__n;i++){
    ptr[i]=tmp;
  }
 #endif 
    return ptr;
  }
  void deallocate(pointer __p, size_type) { 
 #ifdef GRID_COMMS_SHMEM
    shmem_free((void *)__p);
 #else
 #ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p); 
 #else
    free((void *)__p);
 #endif
  }
 #endif
  }
  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 // Any two commAllocator instances of the same value type compare equal.
 template<typename _Tp>
 inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&)
 {
   return true;
 }
 // Counterpart of operator== above: never reports a difference.
 template<typename _Tp>
 inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&)
 {
   return false;
 }
-////////////////////////////////////////////////////////////////////////////////
+template<typename _Tp>  inline bool
-// Template typedefs
+operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
-////////////////////////////////////////////////////////////////////////////////
+
-template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;           
+template<typename _Tp>  inline bool
-template<class T> using commVector = std::vector<T,commAllocator<T> >;              
+operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 }; // namespace Grid
 #endif
--- a/lib/Cartesian.h
+++ b/lib/Cartesian.h
@ -28,8 +28,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_H
 #define GRID_CARTESIAN_H
-#include <Grid/cartesian/Cartesian_base.h>
+#include <cartesian/Cartesian_base.h>
-#include <Grid/cartesian/Cartesian_full.h>
+#include <cartesian/Cartesian_full.h>
-#include <Grid/cartesian/Cartesian_red_black.h> 
+#include <cartesian/Cartesian_red_black.h> 
 #endif
--- a/lib/Communicator.h
+++ b/lib/Communicator.h
@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_COMMUNICATOR_H
 #define GRID_COMMUNICATOR_H
-#include <Grid/communicator/Communicator_base.h>
+#include <communicator/Communicator_base.h>
 #endif
--- a/lib/Cshift.h
+++ b/lib/Cshift.h
@ -28,21 +28,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_CSHIFT_H_
 #define _GRID_CSHIFT_H_
-#include <Grid/cshift/Cshift_common.h>
+#include <cshift/Cshift_common.h>
 #ifdef GRID_COMMS_NONE
-#include <Grid/cshift/Cshift_none.h>
+#include <cshift/Cshift_none.h>
 #endif
 #ifdef GRID_COMMS_MPI
-#include <Grid/cshift/Cshift_mpi.h>
+#include <cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_MPI3
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_SHMEM
-#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
+#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
 #endif
--- a/lib/FFT.h
+++ b/lib/FFT.h
@ -1,271 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/FFT.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 #ifdef HAVE_FFTW	
 #include <fftw3.h>
 #endif
 namespace Grid {
  // Primary template: deliberately empty. The usable members come from the
  // ComplexD and ComplexF specialisations that are compiled under HAVE_FFTW.
  template<class scalar> struct FFTW { };
 #ifdef HAVE_FFTW	
  // Double-precision binding: each member forwards to the ::fftw_* function of the same role.
  template<> struct FFTW<ComplexD> {
  public:
    using FFTW_scalar = fftw_complex;
    using FFTW_plan   = fftw_plan;
    // Create a plan; every argument is handed to ::fftw_plan_many_dft in the same order.
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
                                        FFTW_scalar *in,  const int *inembed, int istride, int idist,
                                        FFTW_scalar *out, const int *onembed, int ostride, int odist,
                                        int sign, unsigned flags)
    {
      return ::fftw_plan_many_dft(rank, n, howmany,
                                  in,  inembed, istride, idist,
                                  out, onembed, ostride, odist,
                                  sign, flags);
    }
    // Operation counts of plan p, written through add, mul and fmas.
    static void fftw_flops(const FFTW_plan p, double *add, double *mul, double *fmas)
    {
      ::fftw_flops(p, add, mul, fmas);
    }
    // Run plan p on the given input and output arrays.
    inline static void fftw_execute_dft(const FFTW_plan p, FFTW_scalar *in, FFTW_scalar *out)
    {
      ::fftw_execute_dft(p, in, out);
    }
    // Dispose of plan p.
    inline static void fftw_destroy_plan(const FFTW_plan p)
    {
      ::fftw_destroy_plan(p);
    }
  };
  // Single-precision binding: each member forwards to the ::fftwf_* function of the same role.
  template<> struct FFTW<ComplexF> {
  public:
    using FFTW_scalar = fftwf_complex;
    using FFTW_plan   = fftwf_plan;
    // Create a plan; every argument is handed to ::fftwf_plan_many_dft in the same order.
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
                                        FFTW_scalar *in,  const int *inembed, int istride, int idist,
                                        FFTW_scalar *out, const int *onembed, int ostride, int odist,
                                        int sign, unsigned flags)
    {
      return ::fftwf_plan_many_dft(rank, n, howmany,
                                   in,  inembed, istride, idist,
                                   out, onembed, ostride, odist,
                                   sign, flags);
    }
    // Operation counts of plan p, written through add, mul and fmas.
    static void fftw_flops(const FFTW_plan p, double *add, double *mul, double *fmas)
    {
      ::fftwf_flops(p, add, mul, fmas);
    }
    // Run plan p on the given input and output arrays.
    inline static void fftw_execute_dft(const FFTW_plan p, FFTW_scalar *in, FFTW_scalar *out)
    {
      ::fftwf_execute_dft(p, in, out);
    }
    // Dispose of plan p.
    inline static void fftw_destroy_plan(const FFTW_plan p)
    {
      ::fftwf_destroy_plan(p);
    }
  };
 #endif
 #ifndef FFTW_FORWARD
 #define FFTW_FORWARD (-1)
 #define FFTW_BACKWARD (+1)
 #endif
  // FFT: Fourier transform of a Lattice along one dimension at a time, built on the
  // FFTW<scalar> wrappers. Flop and time totals accumulate across FFT_dim calls.
  // NOTE(review): the class owns sgrid through a raw pointer and declares no copy
  // constructor/assignment, so a copied FFT would delete sgrid twice - avoid copying.
  class FFT { 
  private:
    GridCartesian *vgrid;           // grid passed to the constructor; not deleted here
    GridCartesian *sgrid;           // built in the constructor with a layout of all ones; deleted in ~FFT
    int Nd;
    double flops;                   // running total of flops_call*NN over all FFT_dim calls
    double flops_call;              // add+mul+2*fma as reported by fftw_flops for the latest plan
    uint64_t usec;                  // running total of timer.useconds() around the transform loop
    std::vector<int> dimensions;
    std::vector<int> processors;
    std::vector<int> processor_coor;
  public:
    static const int forward=FFTW_FORWARD;
    static const int backward=FFTW_BACKWARD;
    double Flops(void) {return flops;}
    // Accumulated flops divided by accumulated microseconds.
    double MFlops(void) {return flops/usec;}
    // Records the dimensions and processor decomposition of `grid` and builds the
    // companion grid sgrid on the same dimensions with a layout of all ones.
    FFT ( GridCartesian * grid ) : 
      vgrid(grid),
      Nd(grid->_ndimension),
      dimensions(grid->_fdimensions),
      processors(grid->_processors),
      processor_coor(grid->_processor_coor)
    {
      flops=0;
      flops_call=0; // was left uninitialised until the first FFT_dim call
      usec =0;
      std::vector<int> layout(Nd,1);
      sgrid = new GridCartesian(dimensions,layout,processors);
    }
    ~FFT ( void)  { 
      delete sgrid; 
    }
    // Transform `source` along dimension `dim` and store the outcome in `result`.
    //   result, source : both are checked with conformable() against the constructor grid
    //   dim            : index of the dimension to transform
    //   inverse        : zero selects FFTW_FORWARD, non-zero selects FFTW_BACKWARD
    // No normalisation factor is applied in this routine.
    // When HAVE_FFTW is not defined the routine stops on assert(0).
    template<class vobj>
    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){
      conformable(result._grid,vgrid);
      conformable(source._grid,vgrid);
      int L = vgrid->_ldimensions[dim];
      int G = vgrid->_fdimensions[dim];
      std::vector<int> layout(Nd,1);
      std::vector<int> pencil_gd(vgrid->_fdimensions);
      pencil_gd[dim] = G*processors[dim];
      // Pencil global vol LxLxGxLxL per node
      GridCartesian pencil_g(pencil_gd,layout,processors);
      // Construct pencils
      typedef typename vobj::scalar_object sobj;
      typedef typename sobj::scalar_type   scalar;
      Lattice<vobj> ssource(vgrid); ssource =source;
      Lattice<sobj> pgsource(&pencil_g);
      Lattice<sobj> pgresult(&pencil_g); pgresult=zero;
 #ifndef HAVE_FFTW
      assert(0);
 #else
      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
      {
        int Ncomp = sizeof(sobj)/sizeof(scalar);
        int Nlow  = 1;
        for(int d=0;d<dim;d++){
          Nlow*=vgrid->_ldimensions[d];
        }
        int rank = 1;  /* 1d transforms */
        int n[] = {G}; /* 1d transforms of length G */
        int howmany = Ncomp;
        int odist,idist,istride,ostride;
        idist   = odist   = 1;          /* Distance between consecutive FT's */
        istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
        int *inembed = n, *onembed = n;
        int sign = FFTW_FORWARD;
        if (inverse) sign = FFTW_BACKWARD;
        // The plan is created once on the first site and then re-applied to other
        // sites through fftw_execute_dft.
        FFTW_plan p;
        {
          FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
          FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
          p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
                                               in,inembed,
                                               istride,idist,
                                               out,onembed,
                                               ostride, odist,
                                               sign,FFTW_ESTIMATE);
        }
        std::vector<int> lcoor(Nd), gcoor(Nd);
        // Barrel shift and collect global pencil
        // (index renamed from p so that it no longer shadows the plan p)
        for(int pe=0;pe<processors[dim];pe++) { 
          for(int idx=0;idx<sgrid->lSites();idx++) { 
            sgrid->LocalIndexToLocalCoor(idx,lcoor);
            sobj s;
            peekLocalSite(s,ssource,lcoor);
            lcoor[dim]+=pe*L;
            pokeLocalSite(s,pgsource,lcoor);
          }
          ssource = Cshift(ssource,dim,L);
        }
        // Loop over orthog coords
        int NN=pencil_g.lSites();
        GridStopWatch timer;
        timer.Start();
PARALLEL_FOR_LOOP
        for(int idx=0;idx<NN;idx++) {
          // Each iteration owns its coordinate buffer. The buffer used to be the
          // lcoor declared outside the loop, which every iteration overwrote; if
          // PARALLEL_FOR_LOOP expands to a threaded loop (presumably OpenMP - TODO
          // confirm) that shared write is a data race.
          std::vector<int> pcoor(Nd);
          pencil_g.LocalIndexToLocalCoor(idx,pcoor);
          if ( pcoor[dim] == 0 ) {  // restricts loop to plane at pcoor[dim]==0
            FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
            FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
            FFTW<scalar>::fftw_execute_dft(p,in,out);
          }
        }
        timer.Stop();
        double add,mul,fma;
        FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
        flops_call = add+mul+2.0*fma;
        usec += timer.useconds();
        flops+= flops_call*NN;
        int pc = processor_coor[dim];
        for(int idx=0;idx<sgrid->lSites();idx++) {
          sgrid->LocalIndexToLocalCoor(idx,lcoor);
          gcoor = lcoor;
          // extract the result
          sobj s;
          gcoor[dim] = lcoor[dim]+L*pc;
          peekLocalSite(s,pgresult,gcoor);
          pokeLocalSite(s,result,lcoor);
        }
        FFTW<scalar>::fftw_destroy_plan(p);
      }
 #endif
    }
  };
 }
 #endif
--- a/lib/Grid.h
+++ b/lib/Grid.h
@ -59,31 +59,29 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ///////////////////
 // Grid headers
 ///////////////////
-#include <Grid/serialisation/Serialisation.h>
+#include <serialisation/Serialisation.h>
-#include "Config.h"
+#include <Config.h>
-#include <Grid/Timer.h>
+#include <Timer.h>
-#include <Grid/PerfCount.h>
+#include <PerfCount.h>
-#include <Grid/Log.h>
+#include <Log.h>
-#include <Grid/AlignedAllocator.h>
+#include <AlignedAllocator.h>
-#include <Grid/Simd.h>
+#include <Simd.h>
-#include <Grid/Threads.h>
+#include <Threads.h>
-#include <Grid/Lexicographic.h>
+#include <Lexicographic.h>
-#include <Grid/Init.h>
+#include <Communicator.h> 
-#include <Grid/Communicator.h> 
+#include <Cartesian.h>    
-#include <Grid/Cartesian.h>    
+#include <Tensors.h>      
-#include <Grid/Tensors.h>      
+#include <Lattice.h>      
-#include <Grid/Lattice.h>      
+#include <Cshift.h>       
-#include <Grid/Cshift.h>       
+#include <Stencil.h>      
-#include <Grid/Stencil.h>      
+#include <Algorithms.h>   
-#include <Grid/Algorithms.h>   
+#include <parallelIO/BinaryIO.h>
-#include <Grid/parallelIO/BinaryIO.h>
+#include <qcd/QCD.h>
-#include <Grid/qcd/QCD.h>
+#include <parallelIO/NerscIO.h>
-#include <Grid/parallelIO/NerscIO.h>
+#include <Init.h>
-#include <Grid/FFT.h>
+#include <qcd/hmc/NerscCheckpointer.h>
-
+#include <qcd/hmc/HmcRunner.h>
 #include <Grid/qcd/hmc/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/HmcRunner.h>
--- a/lib/Init.cc
+++ b/lib/Init.cc
@ -153,7 +153,6 @@ void GridParseLayout(char **argv,int argc,
    assert(ompthreads.size()==1);
    GridThread::SetThreads(ompthreads[0]);
  }
  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
    std::vector<int> cores(0);
    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
@ -171,17 +170,14 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 /////////////////////////////////////////////////////////
 //
 /////////////////////////////////////////////////////////
 static int Grid_is_initialised = 0;
 void Grid_init(int *argc,char ***argv)
 {
  GridLogger::StopWatch.Start();
  CartesianCommunicator::Init(argc,argv);
  // Parse command line args.
  GridLogger::StopWatch.Start();
  std::string arg;
  std::vector<std::string> logstreams;
  std::string defaultLog("Error,Warning,Message,Performance");
@ -197,7 +193,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"--threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
-    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
+    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;
    exit(EXIT_SUCCESS);
  }
@ -207,6 +203,7 @@ void Grid_init(int *argc,char ***argv)
    GridLogConfigure(logstreams);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
@ -219,14 +216,11 @@ void Grid_init(int *argc,char ***argv)
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
    GridLogTimestamp(1);
  }
  GridParseLayout(*argv,*argc,
 		  Grid_default_latt,
 		  Grid_default_mpi);
@ -240,34 +234,26 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
  }
  std::string COL_RED    = GridLogColours.colour["RED"];
  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
  std::string COL_BLACK  = GridLogColours.colour["BLACK"];
  std::string COL_GREEN  = GridLogColours.colour["GREEN"];
  std::string COL_BLUE   = GridLogColours.colour["BLUE"];
  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
  std::cout <<std::endl;
-  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<Logger::RED  << "__|__|__|__|__"<<             "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<Logger::RED  << "__|__|__|__|__"<<             "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<COL_RED  << "__|_ |  |  |  "<<             "|  |  | "<<COL_PURPLE<<" |  |  |"<<                "  |  |  | _|__"<<std::endl; 
+  std::cout <<Logger::RED  << "__|__|  |  |  "<<             "|  |  | "<<Logger::PURPLE<<" |  |  |"<<                "  |  |  | _|__"<<std::endl; 
-  std::cout <<COL_RED  << "__|_          "<<             "        "<<COL_PURPLE<<"        "<<                "          _|__"<<std::endl; 
+  std::cout <<Logger::RED  << "__|__         "<<             "        "<<Logger::PURPLE<<"        "<<                "          _|__"<<std::endl; 
-  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<Logger::RED  << "__|_  "<<Logger::GREEN<<" GGGG   "<<Logger::RED<<" RRRR   "<<Logger::BLUE  <<" III    "<<Logger::PURPLE<<"DDDD  "<<Logger::PURPLE<<"    _|__"<<std::endl;
-  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<Logger::RED  << "__|_  "<<Logger::GREEN<<"G       "<<Logger::RED<<" R   R  "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D   D "<<Logger::PURPLE<<"    _|__"<<std::endl;
-  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<Logger::RED  << "__|_  "<<Logger::GREEN<<"G       "<<Logger::RED<<" R   R  "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D    D"<<Logger::PURPLE<<"    _|__"<<std::endl;
-  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G  GG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<Logger::BLUE << "__|_  "<<Logger::GREEN<<"G  GG   "<<Logger::RED<<" RRRR   "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D    D"<<Logger::GREEN <<"    _|__"<<std::endl;
-  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G   G   "<<COL_RED<<" R  R   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<Logger::BLUE << "__|_  "<<Logger::GREEN<<"G   G   "<<Logger::RED<<" R  R   "<<Logger::BLUE  <<"  I     "<<Logger::PURPLE<<"D   D "<<Logger::GREEN <<"    _|__"<<std::endl;
-  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" R   R  "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<Logger::BLUE << "__|_  "<<Logger::GREEN<<" GGGG   "<<Logger::RED<<" R   R  "<<Logger::BLUE  <<" III    "<<Logger::PURPLE<<"DDDD  "<<Logger::GREEN <<"    _|__"<<std::endl;
-  std::cout <<COL_BLUE << "__|_          "<<             "        "<<COL_GREEN <<"        "<<                "          _|__"<<std::endl; 
+  std::cout <<Logger::BLUE << "__|__         "<<             "        "<<Logger::GREEN <<"        "<<                "          _|__"<<std::endl; 
-  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<Logger::BLUE << "__|__|__|__|__"<<             "|__|__|_"<<Logger::GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
+  std::cout <<Logger::BLUE << "__|__|__|__|__"<<             "|__|__|_"<<Logger::GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
-  std::cout <<COL_BLUE << "  |  |  |  |  "<<             "|  |  | "<<COL_GREEN <<" |  |  |"<<                "  |  |  |  |  "<<std::endl; 
+  std::cout <<Logger::BLUE << "  |  |  |  |  "<<             "|  |  | "<<Logger::GREEN <<" |  |  |"<<                "  |  |  |  |  "<<std::endl; 
  std::cout << std::endl;
  std::cout << std::endl;
-  std::cout <<COL_YELLOW<< std::endl;
+  std::cout <<Logger::YELLOW<< std::endl;
  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
  std::cout << "Colours by Tadahito Boyle "<<std::endl;
  std::cout << std::endl;
  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
  std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
@ -278,16 +264,13 @@ void Grid_init(int *argc,char ***argv)
  std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
  std::cout << "GNU General Public License for more details."<<std::endl;
-  std::cout << COL_BACKGROUND <<std::endl;
+  std::cout << Logger::BLACK <<std::endl;
  std::cout << std::endl;
  Grid_is_initialised = 1;
 }
 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#ifdef GRID_COMMS_MPI
  MPI_Finalize();
  Grid_unquiesce_nodes();
 #endif
--- a/lib/Init.h
+++ b/lib/Init.h
@ -33,7 +33,6 @@ namespace Grid {
  void Grid_init(int *argc,char ***argv);
  void Grid_finalize(void);
  // internal, controlled with --handle
  void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
  void Grid_debug_handler_init(void);
@ -45,7 +44,6 @@ namespace Grid {
  const std::vector<int> &GridDefaultMpi(void);
  const int              &GridThreads(void)  ;
  void                    GridSetThreads(int t) ;
  void GridLogTimestamp(int);
  // Common parsing chores
  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
--- a/lib/Lattice.h
+++ b/lib/Lattice.h
@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_LATTICE_H
 #define GRID_LATTICE_H
-#include <Grid/lattice/Lattice_base.h>
+#include <lattice/Lattice_base.h>
 #endif
--- a/lib/Log.cc
+++ b/lib/Log.cc
@ -1,97 +1,126 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/Log.cc
+    Source file: ./lib/Log.cc
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
 /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 GridStopWatch Logger::StopWatch;
-int Logger::timestamp;
+std::ostream  Logger::devnull(0);
-std::ostream Logger::devnull(0);
+std::string Logger::BLACK("\033[30m");
 std::string Logger::RED("\033[31m");
 std::string Logger::GREEN("\033[32m");
 std::string Logger::YELLOW("\033[33m");
 std::string Logger::BLUE("\033[34m");
 std::string Logger::PURPLE("\033[35m");
 std::string Logger::CYAN("\033[36m");
 std::string Logger::WHITE("\033[37m");
 std::string Logger::NORMAL("\033[0;39m");
 std::string EMPTY("");
-void GridLogTimestamp(int on){
+#if 0  
-  Logger::Timestamp(on);
+  GridLogger GridLogError      (1,"Error",Logger::RED);
-}
+  GridLogger GridLogWarning    (1,"Warning",Logger::YELLOW);
  GridLogger GridLogMessage    (1,"Message",Logger::BLACK);
  GridLogger GridLogDebug      (1,"Debug",Logger::PURPLE);
  GridLogger GridLogPerformance(1,"Performance",Logger::GREEN);
  GridLogger GridLogIterative  (1,"Iterative",Logger::BLUE);
  GridLogger GridLogIntegrator (1,"Integrator",Logger::BLUE);
 #else
  GridLogger GridLogError      (1,"Error",EMPTY);
  GridLogger GridLogWarning    (1,"Warning",EMPTY);
  GridLogger GridLogMessage    (1,"Message",EMPTY);
  GridLogger GridLogDebug      (1,"Debug",EMPTY);
  GridLogger GridLogPerformance(1,"Performance",EMPTY);
  GridLogger GridLogIterative  (1,"Iterative",EMPTY);
  GridLogger GridLogIntegrator (1,"Integrator",EMPTY);
 #endif
-Colours GridLogColours(0);
+void GridLogConfigure(std::vector<std::string> &logstreams)
-GridLogger GridLogError(1, "Error", GridLogColours, "RED");
+{
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
 GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
 GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
  GridLogError.Active(0);
  GridLogWarning.Active(0);
-  GridLogMessage.Active(1); // at least the messages should be always on
+  GridLogMessage.Active(0);
  GridLogIterative.Active(0);
  GridLogDebug.Active(0);
  GridLogPerformance.Active(0);
  GridLogIntegrator.Active(0);
  GridLogColours.Active(0);
-  for (int i = 0; i < logstreams.size(); i++) {
+  int blackAndWhite = 1;
-    if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
+  if(blackAndWhite){
-    if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
+    Logger::BLACK = std::string("");
-    if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
+    Logger::RED    =Logger::BLACK;
-    if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
+    Logger::GREEN  =Logger::BLACK;
-    if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
+    Logger::YELLOW =Logger::BLACK;
-    if (logstreams[i] == std::string("Performance"))
+    Logger::BLUE   =Logger::BLACK;
-      GridLogPerformance.Active(1);
+    Logger::PURPLE =Logger::BLACK;
-    if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
+    Logger::CYAN   =Logger::BLACK;
-    if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
+    Logger::WHITE  =Logger::BLACK;
    Logger::NORMAL =Logger::BLACK;
  }
  for(int i=0;i<logstreams.size();i++){
    if ( logstreams[i]== std::string("Error")       ) GridLogError.Active(1);
    if ( logstreams[i]== std::string("Warning")     ) GridLogWarning.Active(1);
    if ( logstreams[i]== std::string("Message")     ) GridLogMessage.Active(1);
    if ( logstreams[i]== std::string("Iterative")   ) GridLogIterative.Active(1);
    if ( logstreams[i]== std::string("Debug")       ) GridLogDebug.Active(1);
    if ( logstreams[i]== std::string("Performance") ) GridLogPerformance.Active(1);
    if ( logstreams[i]== std::string("Integrator" ) ) GridLogIntegrator.Active(1);
  }
 }
 ////////////////////////////////////////////////////////////
 // Verbose limiter on MPI tasks
 ////////////////////////////////////////////////////////////
-void Grid_quiesce_nodes(void) {
+void Grid_quiesce_nodes(void)
-  int me = 0;
+{
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
+  int me=0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &me);
+#ifdef GRID_COMMS_MPI
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
 #endif
 #ifdef GRID_COMMS_SHMEM
  me = shmem_my_pe();
 #endif
-  if (me) {
+  if ( me ) { 
    std::cout.setstate(std::ios::badbit);
  }
 }
-void Grid_unquiesce_nodes(void) {
+void Grid_unquiesce_nodes(void)
 {
 #ifdef GRID_COMMS_MPI
-  std::cout.clear();
+    std::cout.clear();
 #endif
 }
 }
--- a/lib/Log.h
+++ b/lib/Log.h
@ -6,9 +6,9 @@
    Copyright (C) 2015
-    Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
-    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -27,9 +27,6 @@
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <map>
 #ifndef GRID_LOG_H
 #define GRID_LOG_H
@ -39,98 +36,54 @@
 namespace Grid {
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // Dress the output; use std::chrono for time stamping via the StopWatch class
-//////////////////////////////////////////////////////////////////////////////////////////////////
+int Rank(void); // used for early stage debug before library init
 // Lookup table from a colour name to the terminal escape sequence that selects it.
 // While colouring is switched off every name maps to an empty string, so callers
 // can stream colour[...] unconditionally.
 class Colours{
 protected:
  bool is_active;   // state requested by the most recent Active() call
 public:
  // Keys: BLACK, RED, GREEN, YELLOW, BLUE, PURPLE, CYAN, WHITE, NORMAL.
  std::map<std::string, std::string> colour;
  // Builds the table in the requested state; colouring is off by default.
  Colours(bool activate=false){
    Active(activate);
  }
  // Switches colouring on or off and rewrites all nine entries to match.
  void Active(bool activate){
    static const char *const palette[][2] = {
      {"BLACK" , "\033[30m"  },
      {"RED"   , "\033[31m"  },
      {"GREEN" , "\033[32m"  },
      {"YELLOW", "\033[33m"  },
      {"BLUE"  , "\033[34m"  },
      {"PURPLE", "\033[35m"  },
      {"CYAN"  , "\033[36m"  },
      {"WHITE" , "\033[37m"  },
      {"NORMAL", "\033[0;39m"}
    };
    is_active = activate;
    for (const auto &entry : palette){
      colour[entry[0]] = is_active ? entry[1] : "";
    }
  }
 };
 class Logger {
 protected:
-  Colours &Painter;
+    int active;
-  int active;
+    std::string name, topName, COLOUR;
  static int timestamp;
  std::string name, topName;
  std::string COLOUR;
 public:
-  static GridStopWatch StopWatch;
+    static GridStopWatch StopWatch;
-  static std::ostream devnull;
+    static std::ostream devnull;
-  std::string background() {return Painter.colour["NORMAL"];}
+    static std::string BLACK;
-  std::string evidence() {return Painter.colour["YELLOW"];}
+    static std::string RED  ;
-  std::string colour() {return Painter.colour[COLOUR];}
+    static std::string GREEN;
-
+    static std::string YELLOW;
-  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)  : active(on),
+    static std::string BLUE  ;
-    name(nm),
+    static std::string PURPLE;
-    topName(topNm),
+    static std::string CYAN  ;
-    Painter(col_class),
+    static std::string WHITE ;
-    COLOUR(col) {} ;
+    static std::string NORMAL;
-  
+    
-  void Active(int on) {active = on;};
+ Logger(std::string topNm, int on, std::string nm,std::string col)
-  int  isActive(void) {return active;};
+   : active(on), name(nm), topName(topNm), COLOUR(col) {};
-  static void Timestamp(int on) {timestamp = on;};
+    
-  
+    void Active(int on) {active = on;};
-  friend std::ostream& operator<< (std::ostream& stream, Logger& log){
+    int  isActive(void) {return active;};
-
+    
-    if ( log.active ) {
+    friend std::ostream& operator<< (std::ostream& stream, const Logger& log){
-      stream << log.background()<< log.topName << log.background()<< " : ";
+        if ( log.active ) {
-      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
+            StopWatch.Stop();
-      if ( log.timestamp ) {
+            GridTime now = StopWatch.Elapsed();
-	StopWatch.Stop();
+            StopWatch.Start();
-	GridTime now = StopWatch.Elapsed();
+            stream << BLACK <<std::setw(8) << std::left << log.topName << BLACK<< " : ";
-	StopWatch.Start();
+            stream << log.COLOUR <<std::setw(11)  << log.name << BLACK << " : ";
-	stream << log.evidence()<< now << log.background() << " : " ;
+            stream << YELLOW <<std::setw(6) << now <<BLACK << " : " ;
-      }
+            stream << log.COLOUR;
-      stream << log.colour();
+            return stream;
-      return stream;
+        } else { 
-    } else { 
+            return devnull;
-      return devnull;
+        }
    }
-  }
+    
 };
-
+    
 class GridLogger: public Logger {
 public:
-  GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
+ GridLogger(int on, std::string nm, std::string col = Logger::BLACK): Logger("Grid", on, nm, col){};
  Logger("Grid", on, nm, col_class, col_key){};
 };
 void GridLogConfigure(std::vector<std::string> &logstreams);
@ -142,40 +95,38 @@ extern GridLogger GridLogDebug  ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative  ;
 extern GridLogger GridLogIntegrator  ;
 extern Colours    GridLogColours;
 #define _NBACKTRACE (256)
 extern void * Grid_backtrace_buffer[_NBACKTRACE];
 #define BACKTRACEFILE() {\
-char string[20];					\
+    char string[20];					\
-std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \
+    std::sprintf(string,"backtrace.%d",Rank());				\
-std::FILE * fp = std::fopen(string,"w");				\
+    std::FILE * fp = std::fopen(string,"w");				\
-BACKTRACEFP(fp)\
+    BACKTRACEFP(fp)\
-std::fclose(fp);	    \
+    std::fclose(fp);	    \
 }
 #ifdef HAVE_EXECINFO_H
 #define BACKTRACEFP(fp) { \
-int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
+  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);\
-char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
+  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
-for (int i = 0; i < symbols; i++){\
+  for (int i = 0; i < symbols; i++){\
-  std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
+    std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
-}\
+  }\
 }
 #else 
 #define BACKTRACEFP(fp) { \
-std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
+    std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
-std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
+    std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
-std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
+    std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
-std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
+    std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
 }
 #endif
 #define BACKTRACE() BACKTRACEFP(stdout) 
 }
 #endif
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@ -1,32 +1,32 @@
 # additional include paths necessary to compile the C++ library
 AM_CXXFLAGS = -I$(top_srcdir)/
 extra_sources=
 if BUILD_COMMS_MPI
  extra_sources+=communicator/Communicator_mpi.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_mpi3.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_SHMEM
  extra_sources+=communicator/Communicator_shmem.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
 if BUILD_COMMS_NONE
  extra_sources+=communicator/Communicator_none.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
 #
 # Libraries
 #
 include Make.inc
 include Eigen.inc
 lib_LIBRARIES = libGrid.a
 libGrid_a_SOURCES = $(CCFILES) $(extra_sources)
 #	qcd/action/fermion/PartialFractionFermion5D.cc\	\
 #
 # Include files
 #
 nobase_include_HEADERS=$(HFILES)
 libGrid_a_SOURCES              = $(CCFILES) $(extra_sources)
 libGrid_adir                   = $(pkgincludedir)
 nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
--- a/lib/Simd.h
+++ b/lib/Simd.h
@ -1,33 +1,32 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/Simd.h
+    Source file: ./lib/Simd.h
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
 /*  END LEGAL */
 #ifndef GRID_SIMD_H
 #define GRID_SIMD_H
@ -119,14 +118,6 @@ namespace Grid {
  inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
  inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
  inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
  // define projections to real and imaginary parts
  // projReal keeps the real part of r and zeroes the imaginary part.
  // projImag returns the imaginary part of r in the REAL slot of the result,
  // with the imaginary slot set to zero.
  // NOTE(review): confirm callers expect the imaginary part in the real slot.
  inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
  inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
  inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
  inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
  // define auxiliary functions for complex computations
  inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
  inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
  inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
@ -172,8 +163,8 @@ namespace Grid {
 };
-#include "simd/Grid_vector_types.h"
+#include <simd/Grid_vector_types.h>
-#include "simd/Grid_vector_unops.h"
+#include <simd/Grid_vector_unops.h>
 namespace Grid {
  // Default precision
--- a/lib/Stat.cc
+++ b/lib/Stat.cc
@ -1,247 +0,0 @@
 #include <Grid.h>
 #include <PerfCount.h>
 #include <Stat.h>
 namespace Grid { 
 bool PmuStat::pmu_initialized=false;
 // Names this region and zeroes its accumulators. The first call made through
 // any PmuStat also runs the one-off pmu_init(). The body is compiled only for
 // __x86_64__; elsewhere the function does nothing.
 void PmuStat::init(const char *regname)
 {
 #ifdef __x86_64__
  name = regname;
  const bool first_use = !pmu_initialized;
  if (first_use) {
    std::cout << "initialising pmu" << std::endl;
    pmu_initialized = true;
    pmu_init();
  }
  clear();
 #endif
 }
 // Resets every cumulative counter of this region to zero (__x86_64__ builds only).
 void PmuStat::clear(void)
 {
 #ifdef __x86_64__
  // Every accumulator is a uint64_t, so chained assignments zero them in groups.
  count = tregion = tcycles = 0;
  pmc0  = pmc1 = 0;
  inst  = cyc = ref = 0;
  reads = writes = 0;
 #endif
 }
 // Writes the region name followed by one labelled line per cumulative counter
 // to std::cout (__x86_64__ builds only). Each counter line ends with std::endl,
 // so the stream is flushed after every line.
 void PmuStat::print(void)
 {
 #ifdef __x86_64__
  struct Row { const char *label; uint64_t value; };
  const Row rows[] = {
    {"  region ", tregion},
    {"  cycles ", tcycles},
    {"  inst   ", inst   },
    {"  cyc    ", cyc    },
    {"  ref    ", ref    },
    {"  pmc0   ", pmc0   },
    {"  pmc1   ", pmc1   },
    {"  count  ", count  },
    {"  reads  ", reads  },
    {"  writes ", writes }
  };
  std::cout <<"Reg "<<std::string(name)<<":\n";
  for (const Row &row : rows) {
    std::cout << row.label << row.value << std::endl;
  }
 #endif
 }
 // Opens a measured region: calls pmu_start(), counts the invocation, records
 // the memory read/write counters and finally the time-stamp counter.
 // accum() closes the region. Compiled only for __x86_64__.
 void PmuStat::start(void)
 {
 #ifdef __x86_64__
  pmu_start();
  ++count;                       // one more invocation of this region
  xmemctrs(&mrstart, &mwstart);  // memory counters at region entry
  tstart = __rdtsc();            // time stamp at region entry, taken last
 #endif
 }
 // Stores the current raw counter readings in column t of the counters table.
 // t indexes the second dimension of counters (256 entries); it is not range
 // checked here. exit(t) later turns these readings into elapsed counts.
 // Compiled only for __x86_64__.
 void PmuStat::enter(int t)
 {
 #ifdef __x86_64__
  counters[0][t] = __rdpmc(0);            // accumulated into pmc0 by accum()
  counters[1][t] = __rdpmc(1);            // accumulated into pmc1
  counters[2][t] = __rdpmc((1<<30)|0);    // accumulated into inst
  counters[3][t] = __rdpmc((1<<30)|1);    // accumulated into cyc
  counters[4][t] = __rdpmc((1<<30)|2);    // accumulated into ref
  counters[5][t] = __rdtsc();             // accumulated into tcycles
 #endif
 }
 // Replaces each reading stored by enter(t) with the amount elapsed since then:
 // (reading now) - (reading stored in column t). Must follow a matching
 // enter(t); t is not range checked. Compiled only for __x86_64__.
 void PmuStat::exit(int t)
 {
 #ifdef __x86_64__
  counters[0][t] = __rdpmc(0) - counters[0][t];
  counters[1][t] = __rdpmc(1) - counters[1][t];
  counters[2][t] = __rdpmc((1<<30)|0) - counters[2][t];
  counters[3][t] = __rdpmc((1<<30)|1) - counters[3][t];
  counters[4][t] = __rdpmc((1<<30)|2) - counters[4][t];
  counters[5][t] = __rdtsc() - counters[5][t];
 #endif
 }
 // Closes the region opened by start() and adds this invocation to the totals.
 // nthreads is the number of counters columns (0 .. nthreads-1) to sum; the
 // table has 256 columns and the bound is not checked here.
 // Compiled only for __x86_64__.
 void PmuStat::accum(int nthreads)
 {
 #ifdef __x86_64__
  // Take the closing time stamp first, then the memory counters, then stop.
  tend = __rdtsc();
  xmemctrs(&mrend, &mwend);
  pmu_stop();
  // Fold the per-column values (deltas once exit() has run) into the totals.
  for (int t = 0; t < nthreads; ++t) {
    pmc0 += counters[0][t];
    pmc1 += counters[1][t];
    inst += counters[2][t];
    cyc += counters[3][t];
    ref += counters[4][t];
    tcycles += counters[5][t];
  }
  // Time-stamp ticks and memory traffic between start() and this call.
  uint64_t region = tend - tstart;
  tregion += region;
  uint64_t mreads = mrend - mrstart;
  reads += mreads;
  uint64_t mwrites = mwend - mwstart;
  writes += mwrites;
 #endif
 }
 // Empty hooks: nothing is done here to tear down, start or stop the PMU.
 void PmuStat::pmu_fini(void)
 {
 }
 void PmuStat::pmu_start(void)
 {
 }
 void PmuStat::pmu_stop(void)
 {
 }
 // One-time PMU setup. Only builds that define _KNIGHTS_LANDING_ do any work:
 // they open the uncore memory-counter events through KNLsetup().
 void PmuStat::pmu_init(void) {
 #ifdef _KNIGHTS_LANDING_
  KNLsetup();
 #endif
 }
 // Reports the current memory read and write totals through *mr and *mw.
 // With _KNIGHTS_LANDING_ these are the EDC read and write counters summed
 // over all NEDC entries; otherwise both outputs are set to zero.
 void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
 {
 #ifdef _KNIGHTS_LANDING_
  ctrs snapshot;
  KNLreadctrs(snapshot);
  uint64_t total_rd = 0;
  uint64_t total_wr = 0;
  for (int edc = 0; edc < NEDC; ++edc) {
    total_rd += snapshot.edcrd[edc];
    total_wr += snapshot.edcwr[edc];
  }
  *mr = total_rd;
  *mw = total_wr;
 #else
  *mw = 0;
  *mr = 0;
 #endif
 }
 #ifdef _KNIGHTS_LANDING_
 struct knl_gbl_ PmuStat::gbl;
 #define PMU_MEM
 // Opens one uncore performance event and hands its descriptor back through fd.
 //   ename : sysfs directory of the PMU box, e.g. /sys/devices/uncore_imc_0;
 //           the PMU type number is read from the file <ename>/type
 //   fd    : receives the descriptor returned by perf_event_open
 //   event : event code, placed in the low bits of the config word
 //   umask : unit mask, shifted to bit 8 of the config word
 // Every failure is fatal: a diagnostic is written to stderr and the process
 // exits with EXIT_FAILURE, so the error is visible in the exit status.
 void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
 {
  char fname[1024];
  snprintf(fname, sizeof(fname), "%s/type", ename);
  FILE *fp = fopen(fname, "r");
  if (fp == 0) {
    ::fprintf(stderr, "PmuStat: cannot open %s\n", fname);
    ::exit(EXIT_FAILURE);
  }
  int type;
  int ret = fscanf(fp, "%d", &type);
  fclose(fp);
  // Checked explicitly instead of with assert(): under NDEBUG an assert
  // disappears and an uninitialised type would be handed to the kernel.
  if (ret != 1) {
    ::fprintf(stderr, "PmuStat: cannot read PMU type from %s\n", fname);
    ::exit(EXIT_FAILURE);
  }
  //  std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;
  struct perf_event_attr hw = {};
  hw.size = sizeof(hw);
  hw.type = type;
  // see /sys/devices/uncore_*/format/*
  // All of the events we are interested in are configured the same way, but
  // that isn't always true. Proper code would parse the format files
  hw.config = event | (umask << 8);
  //hw.read_format = PERF_FORMAT_GROUP;
  // unfortunately the above only works within a single PMU; might
  // as well just read them one at a time
  int cpu = 0;
  fd = perf_event_open(&hw, -1, cpu, -1, 0);
  if (fd == -1) {
    // Cast so the format specifier is right whatever the config field's exact type.
    ::fprintf(stderr, "PmuStat: perf_event_open failed: CPU %d, box %s, event 0x%llx\n",
              cpu, ename, static_cast<unsigned long long>(hw.config));
    ::exit(EXIT_FAILURE);
  }
  //    std::cout << "event "<<std::string(ename)<<" set up for fd "<<fd<<" hw.config "<<hw.config <<std::endl;
 }
 // Opens every uncore event this class reads and stores the descriptors in gbl:
 //   - for each of the NMC uncore_imc boxes:       RPQ inserts and WPQ inserts
 //   - for each of the NEDC uncore_edc_eclk boxes: RPQ inserts and WPQ inserts
 //   - for each of the NEDC uncore_edc_uclk boxes: HitE, HitM, MissE, MissM
 // KNLevsetup() terminates the process if any event cannot be opened.
 void PmuStat::KNLsetup(void){
   char fname[1024];
   // MC RPQ inserts and WPQ inserts (reads & writes)
   for (int mc = 0; mc < NMC; ++mc)
     {
       ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc);
       // RPQ Inserts
       KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1);
       // WPQ Inserts
       KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1);
     }
   // EDC RPQ inserts and WPQ inserts
   for (int edc=0; edc < NEDC; ++edc)
     {
       ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc);
       // RPQ inserts
       KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1);
       // WPQ inserts
       KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1);
     }
   // EDC HitE, HitM, MissE, MissM (same event code, one umask bit each)
   for (int edc=0; edc < NEDC; ++edc)
     {
       ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc);
       KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1);
       KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2);
       KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4);
       KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8);
     }
 }
 // Reads one 64-bit counter value from the event descriptor fd and returns it.
 // A failed or short read is fatal: the result of read() is reported on stderr
 // and the process exits with EXIT_FAILURE.
 uint64_t PmuStat::KNLreadctr(int fd)
 {
  uint64_t data;
  // read() returns a signed count, -1 on error; keep it signed so the error
  // value is reported as -1 rather than as a huge unsigned number.
  const ssize_t got = ::read(fd, &data, sizeof(data));
  if (got != static_cast<ssize_t>(sizeof(data))){
    ::fprintf(stderr, "PmuStat: read counter returned %ld\n", static_cast<long>(got));
    ::exit(EXIT_FAILURE);
  }
  return data;
 }
 // Reads every open uncore counter into c, in three passes: the memory
 // controller read/write pairs, the EDC read/write pairs, and then the EDC
 // hit/miss counters.
 void PmuStat::KNLreadctrs(ctrs &c)
 {
  for (int mc = 0; mc < NMC; ++mc) {
    c.mcrd[mc] = KNLreadctr(gbl.mc_rd[mc]);
    c.mcwr[mc] = KNLreadctr(gbl.mc_wr[mc]);
  }
  for (int edc = 0; edc < NEDC; ++edc) {
    c.edcrd[edc] = KNLreadctr(gbl.edc_rd[edc]);
    c.edcwr[edc] = KNLreadctr(gbl.edc_wr[edc]);
  }
  for (int edc = 0; edc < NEDC; ++edc) {
    c.edchite[edc]  = KNLreadctr(gbl.edc_hite[edc]);
    c.edchitm[edc]  = KNLreadctr(gbl.edc_hitm[edc]);
    c.edcmisse[edc] = KNLreadctr(gbl.edc_misse[edc]);
    c.edcmissm[edc] = KNLreadctr(gbl.edc_missm[edc]);
  }
 }
 #endif
 }
--- a/lib/Stat.h
+++ b/lib/Stat.h
@ -1,104 +0,0 @@
 #ifndef _GRID_STAT_H
 #define _GRID_STAT_H
 #ifdef AVX512
 #define _KNIGHTS_LANDING_ROOTONLY
 #endif
 namespace Grid { 
 ///////////////////////////////////////////////////////////////////////////////
 // Extra KNL counters from MCDRAM
 ///////////////////////////////////////////////////////////////////////////////
 #ifdef _KNIGHTS_LANDING_
 #define NMC 6
 #define NEDC 8
 // One snapshot of every uncore counter, as filled in by PmuStat::KNLreadctrs().
 // Each array holds the values read from the descriptor array of the matching
 // name in knl_gbl_ (mcrd <- mc_rd, edchite <- edc_hite, and so on).
 struct ctrs
 {
    uint64_t mcrd[NMC];        // memory controller RPQ inserts (reads)
    uint64_t mcwr[NMC];        // memory controller WPQ inserts (writes)
    uint64_t edcrd[NEDC];      // EDC RPQ inserts
    uint64_t edcwr[NEDC];      // EDC WPQ inserts
    uint64_t edchite[NEDC];    // EDC HitE
    uint64_t edchitm[NEDC];    // EDC HitM
    uint64_t edcmisse[NEDC];   // EDC MissE
    uint64_t edcmissm[NEDC];   // EDC MissM
 };
 // Peter/Azusa:
 // Our modification of a code provided by Larry Meadows from Intel
 // Verified by email exchange non-NDA, ok for github. Should be as uses /sys/devices/ FS
 // so is already public and in the linux kernel for KNL.
 // Descriptors of the opened uncore events, one per counter. They are filled in
 // by PmuStat::KNLsetup() through perf_event_open and read back by
 // PmuStat::KNLreadctr().
 struct knl_gbl_
 {
  int mc_rd[NMC];       // uncore_imc_<n>: RPQ inserts
  int mc_wr[NMC];       // uncore_imc_<n>: WPQ inserts
  int edc_rd[NEDC];     // uncore_edc_eclk_<n>: RPQ inserts
  int edc_wr[NEDC];     // uncore_edc_eclk_<n>: WPQ inserts
  int edc_hite[NEDC];   // uncore_edc_uclk_<n>: HitE
  int edc_hitm[NEDC];   // uncore_edc_uclk_<n>: HitM
  int edc_misse[NEDC];  // uncore_edc_uclk_<n>: MissE
  int edc_missm[NEDC];  // uncore_edc_uclk_<n>: MissM
 };
 #endif
 ///////////////////////////////////////////////////////////////////////////////
 // Performance-counter statistics for one named code region.
 // Usage as implemented in Stat.cc: init(name) once, then per invocation
 // start(), enter(t)/exit(t) for each thread slot t, and accum(nthreads);
 // print() dumps the cumulative totals and clear() zeroes them.
 class PmuStat
 {
    // Per-thread scratch table: counters[row][t]; see the row layout below.
    uint64_t counters[8][256];
 #ifdef _KNIGHTS_LANDING_
    static struct knl_gbl_ gbl;   // descriptors of the opened uncore events
 #endif
    const char *name;   // region name passed to init(); the pointer is stored, not copied
    uint64_t reads;     // memory reads
    uint64_t writes;    // memory writes
    uint64_t mrstart;   // memory read counter at start of parallel region
    uint64_t mrend;     // memory read counter at end of parallel region
    uint64_t mwstart;   // memory write counter at start of parallel region
    uint64_t mwend;     // memory write counter at end of parallel region
    // cumulative counters
    uint64_t count;     // number of invocations
    uint64_t tregion;   // total time in parallel region (from thread 0)
    uint64_t tcycles;   // total cycles inside parallel region
    uint64_t inst, ref, cyc;   // fixed counters
    uint64_t pmc0, pmc1;// pmu
    // add memory counters here
    // temp variables
    uint64_t tstart;    // tsc at start of parallel region
    uint64_t tend;      // tsc at end of parallel region
    // Row layout of counters[row][t], as used by enter()/exit()/accum():
    //   0 pmc0
    //   1 pmc1
    //   2 inst
    //   3 cyc
    //   4 ref
    //   5 tsc
    // enter() stores the raw reading, exit() replaces it with the elapsed
    // difference, and accum() adds rows 0-5 into the cumulative members.
    // Rows 6 and 7 are not touched by Stat.cc.
    static bool pmu_initialized;   // set by the first init() call
 public:
    static bool is_init(void){ return pmu_initialized;}
    static void pmu_init(void);
    static void pmu_fini(void);
    static void pmu_start(void);
    static void pmu_stop(void);
    void accum(int nthreads);
    static void xmemctrs(uint64_t *mr, uint64_t *mw);
    void start(void);
    void enter(int t);
    void exit(int t);
    void print(void);
    void init(const char *regname);
    void clear(void);
 #ifdef _KNIGHTS_LANDING_
    static void     KNLsetup(void);
    static uint64_t KNLreadctr(int fd);
    static void     KNLreadctrs(ctrs &c);
    static void     KNLevsetup(const char *ename, int &fd, int event, int umask);
 #endif
  };
 }
 #endif
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@ -30,22 +30,22 @@ Author: neo <cossu@post.kek.jp>
 #ifndef GRID_MATH_H
 #define GRID_MATH_H
-#include <Grid/tensors/Tensor_traits.h>
+#include <tensors/Tensor_traits.h>
-#include <Grid/tensors/Tensor_class.h>
+#include <tensors/Tensor_class.h>
-#include <Grid/tensors/Tensor_arith.h>
+#include <tensors/Tensor_arith.h>
-#include <Grid/tensors/Tensor_inner.h>
+#include <tensors/Tensor_inner.h>
-#include <Grid/tensors/Tensor_outer.h>
+#include <tensors/Tensor_outer.h>
-#include <Grid/tensors/Tensor_transpose.h>
+#include <tensors/Tensor_transpose.h>
-#include <Grid/tensors/Tensor_trace.h>
+#include <tensors/Tensor_trace.h>
-#include <Grid/tensors/Tensor_index.h>
+#include <tensors/Tensor_index.h>
-#include <Grid/tensors/Tensor_Ta.h>
+#include <tensors/Tensor_Ta.h>
-#include <Grid/tensors/Tensor_determinant.h>
+#include <tensors/Tensor_determinant.h>
-#include <Grid/tensors/Tensor_exp.h>
+#include <tensors/Tensor_exp.h>
-//#include <Grid/tensors/Tensor_peek.h>
+//#include <tensors/Tensor_peek.h>
-//#include <Grid/tensors/Tensor_poke.h>
+//#include <tensors/Tensor_poke.h>
-#include <Grid/tensors/Tensor_reality.h>
+#include <tensors/Tensor_reality.h>
-#include <Grid/tensors/Tensor_unary.h>
+#include <tensors/Tensor_unary.h>
-#include <Grid/tensors/Tensor_extract_merge.h>
+#include <tensors/Tensor_extract_merge.h>
-#include <Grid/tensors/Tensor_logical.h>
+#include <tensors/Tensor_logical.h>
 #endif
--- a/lib/Threads.h
+++ b/lib/Threads.h
@ -37,11 +37,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_OMP
 #include <omp.h>
-#ifdef GRID_NUMA
+#define PARALLEL_FOR_LOOP _Pragma("omp parallel for ")
 #define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
 #else
 #define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
 #endif
 #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
 #else
 #define PARALLEL_FOR_LOOP 
@ -127,22 +123,6 @@ class GridThread {
    ThreadBarrier();
  };
  // Copies len bytes from src to dst (bcopy argument order: source first).
  // With GRID_OMP the buffer is copied in per-thread pieces inside a parallel
  // region; otherwise it is copied in one call.
  //
  // The copy is done with the compiler's memmove builtin. An unqualified
  // bcopy(...) call inside this class resolves to this member itself, so a body
  // written that way recurses without end instead of reaching the C library.
  // The builtin needs no extra header and, like bcopy, tolerates overlap.
  static void bcopy(const void *src, void *dst, size_t len) {
 #ifdef GRID_OMP
 #pragma omp parallel 
    {
      const char *c_src =(const char *) src;
      char *c_dest=(char *) dst;
      int me,mywork,myoff;
      // GetWorkBarrier is declared elsewhere in GridThread; it is expected to
      // return this thread's byte count (mywork) and start offset (myoff).
      GridThread::GetWorkBarrier(len,me, mywork,myoff);
      __builtin_memmove(&c_dest[myoff],&c_src[myoff],mywork);
    }
 #else 
    __builtin_memmove(dst,src,len);
 #endif
  }
 };
 }
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@ -31,6 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H
 #include <Grid.h>
 namespace Grid {
--- a/lib/algorithms/SparseMatrix.h
+++ b/lib/algorithms/SparseMatrix.h
@ -28,6 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_ALGORITHM_SPARSE_MATRIX_H
 #define  GRID_ALGORITHM_SPARSE_MATRIX_H
 #include <Grid.h>
 namespace Grid {
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@ -29,7 +29,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CHEBYSHEV_H
 #define GRID_CHEBYSHEV_H
-#include <Grid/algorithms/LinearOperator.h>
+#include<Grid.h>
 #include<algorithms/LinearOperator.h>
 namespace Grid {
--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@ -18,10 +18,10 @@
 #include <stddef.h>
 #include <Config.h>
-#ifdef HAVE_LIBGMP
+#ifdef HAVE_GMP_H
-#include "bigfloat.h"
+#include <algorithms/approx/bigfloat.h>
 #else
-#include "bigfloat_double.h"
+#include <algorithms/approx/bigfloat_double.h>
 #endif
 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@ -1,168 +1,150 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/algorithms/iterative/ConjugateGradient.h
+    Source file: ./lib/algorithms/iterative/ConjugateGradient.h
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
 /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_H
 #define GRID_CONJUGATE_GRADIENT_H
 namespace Grid {
-/////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////
-// Base classes for iterative processes based on operators
+    // Base classes for iterative processes based on operators
-// single input vec, single output vec.
+    // single input vec, single output vec.
-/////////////////////////////////////////////////////////////
+    /////////////////////////////////////////////////////////////
-template <class Field>
+  template<class Field> 
-class ConjugateGradient : public OperatorFunction<Field> {
+    class ConjugateGradient : public OperatorFunction<Field> {
- public:
+public:                                                
-  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+    RealD   Tolerance;
-                           // Defaults true.
+    Integer MaxIterations;
-  RealD Tolerance;
+    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
-  Integer MaxIterations;
+    };
  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
      : Tolerance(tol),
        MaxIterations(maxit),
        ErrorOnNoConverge(err_on_no_conv){};
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
                  Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
-    RealD cp, c, a, d, b, ssq, qq, b_pred;
+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
-    Field p(src);
+      psi.checkerboard = src.checkerboard;
-    Field mmp(src);
+      conformable(psi,src);
    Field r(src);
-    // Initial residual computation & set up
+      RealD cp,c,a,d,b,ssq,qq,b_pred;
-    RealD guess = norm2(psi);
+      
-    assert(std::isnan(guess) == 0);
+      Field   p(src);
      Field mmp(src);
      Field   r(src);
      //Initial residual computation & set up
      RealD guess = norm2(psi);
      assert(std::isnan(guess)==0);
-    
+      Linop.HermOpAndNorm(psi,mmp,d,b);
-    Linop.HermOpAndNorm(psi, mmp, d, b);
+      
-    
+      r= src-mmp;
      p= r;
      a  =norm2(p);
      cp =a;
      ssq=norm2(src);
-    r = src - mmp;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
-    p = r;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
-    a = norm2(p);
+      RealD rsq =  Tolerance* Tolerance*ssq;
-    cp = a;
+      
-    ssq = norm2(src);
+      //Check if guess is really REALLY good :)
-
+      if ( cp <= rsq ) {
-    std::cout << GridLogIterative << std::setprecision(4)
+	return;
              << "ConjugateGradient: guess " << guess << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:   src " << ssq << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:    mp " << d << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:   mmp " << b << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:  cp,r " << cp << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:     p " << a << std::endl;
    RealD rsq = Tolerance * Tolerance * ssq;
    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      return;
    }
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq
              << std::endl;
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      c = cp;
      MatrixTimer.Start();
      Linop.HermOpAndNorm(p, mmp, d, qq);
      MatrixTimer.Stop();
      LinalgTimer.Start();
      //  RealD    qqck = norm2(mmp);
      //  ComplexD dck  = innerProduct(p,mmp);
      a = c / d;
      b_pred = a * (a * qq - d) / c;
      cp = axpy_norm(r, -a, mmp, r);
      b = cp / c;
      // Fuse these loops ; should be really easy
      psi = a * p + psi;
      p = p * b + r;
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                << " residual " << cp << " target " << rsq << std::endl;
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;
        RealD mmpnorm = sqrt(norm2(mmp));
        RealD psinorm = sqrt(norm2(psi));
        RealD srcnorm = sqrt(norm2(src));
        RealD resnorm = sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage
                  << "ConjugateGradient: Converged on iteration " << k << std::endl;
        std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq)
                  << " true residual " << true_residual << " target "
                  << Tolerance << std::endl;
        std::cout << GridLogMessage << "Time elapsed: Iterations "
                  << SolverTimer.Elapsed() << " Matrix  "
                  << MatrixTimer.Elapsed() << " Linalg "
                  << LinalgTimer.Elapsed();
        std::cout << std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0);
        return;
      }
      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;
      GridStopWatch LinalgTimer;
      GridStopWatch MatrixTimer;
      GridStopWatch SolverTimer;
      SolverTimer.Start();
      int k;
      for (k=1;k<=MaxIterations;k++){
 	c=cp;
 	MatrixTimer.Start();
 	Linop.HermOpAndNorm(p,mmp,d,qq);
 	MatrixTimer.Stop();
 	LinalgTimer.Start();
 	//	RealD    qqck = norm2(mmp);
 	//	ComplexD dck  = innerProduct(p,mmp);
 	a      = c/d;
 	b_pred = a*(a*qq-d)/c;
 	cp = axpy_norm(r,-a,mmp,r);
 	b = cp/c;
 	// Fuse these loops ; should be really easy
 	psi= a*p+psi;
 	p  = p*b+r;
 	LinalgTimer.Stop();
 	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;
 	// Stopping condition
 	if ( cp <= rsq ) { 
 	  SolverTimer.Stop();
 	  Linop.HermOpAndNorm(psi,mmp,d,qq);
 	  p=mmp-src;
 	  RealD mmpnorm = sqrt(norm2(mmp));
 	  RealD psinorm = sqrt(norm2(psi));
 	  RealD srcnorm = sqrt(norm2(src));
 	  RealD resnorm = sqrt(norm2(p));
 	  RealD true_residual = resnorm/srcnorm;
 	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
 		   <<" computed residual "<<sqrt(cp/ssq)
 		   <<" true residual "    <<true_residual
 		   <<" target "<<Tolerance<<std::endl;
 	  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
 	  std::cout<<std::endl;
 	  assert(true_residual/Tolerance < 1000.0);
 	  return;
 	}
      }
      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
      assert(0);
    }
-    std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
+  };
              << std::endl;
    if (ErrorOnNoConverge) assert(0);
  }
 };
 }
 #endif
--- a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -1,142 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
    Copyright (C) 2015
 Author: Christopher Kelly <ckelly@phys.columbia.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
 #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
 namespace Grid {
  //Mixed precision restarted defect correction CG
  /**
   * Mixed precision restarted defect correction CG.
   *
   * Each outer iteration computes the residual of the current solution with the
   * double-precision operator, converts it to single precision, solves for a
   * correction with a single-precision ConjugateGradient and adds that correction
   * to the double-precision solution. A final double-precision ConjugateGradient
   * is always run afterwards, starting from the accumulated solution.
   *
   * FieldD must satisfy getPrecision<FieldD>::value == 2 and FieldF must satisfy
   * getPrecision<FieldF>::value == 1 (enforced by the enable_if parameters).
   *
   * The operator references and the grid / guesser pointers are stored, not
   * copied; the caller has to keep the referenced objects alive.
   */
  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:
    RealD   Tolerance;            //Target relative residual; the squared-norm target is norm2(src) * Tolerance^2
    Integer MaxInnerIterations;   //Iteration cap handed to every inner CG and to the final double-precision CG
    Integer MaxOuterIterations;   //Maximum number of residual / correction restarts
    GridBase* SinglePrecGrid; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    LinearOperatorBase<FieldF> &Linop_f;
    LinearOperatorBase<FieldD> &Linop_d;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
    // Initialisers are listed in member declaration order, which is the order in
    // which they are actually executed.
    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
      Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
      OuterLoopNormMult(100.),
      Linop_f(_Linop_f), Linop_d(_Linop_d),
      guesser(nullptr) { }
    // Registers a guesser that is invoked as guesser(src_f, sol_f) before every inner solve.
    void useGuesser(LinearFunction<FieldF> &g){
      guesser = &g;
    }
    // Solves for sol_d given src_d_in.
    // sol_d is read before it is written (first HermOp call below), so its value
    // on entry acts as the starting point of the solve.
    void operator() (const FieldD &src_d_in, FieldD &sol_d){
      GridStopWatch TotalTimer;
      TotalTimer.Start();
      int cb = src_d_in.checkerboard;
      sol_d.checkerboard = cb;
      RealD src_norm = norm2(src_d_in);
      RealD stop = src_norm * Tolerance*Tolerance;
      GridBase* DoublePrecGrid = src_d_in._grid;
      FieldD tmp_d(DoublePrecGrid);
      tmp_d.checkerboard = cb;
      FieldD src_d(DoublePrecGrid);
      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
      RealD inner_tol = Tolerance;
      FieldF src_f(SinglePrecGrid);
      src_f.checkerboard = cb;
      FieldF sol_f(SinglePrecGrid);
      sol_f.checkerboard = cb;
      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
      // An inner solve that runs out of iterations is tolerated; the outer loop restarts from the new residual.
      CG_f.ErrorOnNoConverge = false;
      GridStopWatch InnerCGtimer;
      GridStopWatch PrecChangeTimer;
      for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
        //Compute double precision rsd and also new RHS vector.
        Linop_d.HermOp(sol_d, tmp_d);
        RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
        if(norm < OuterLoopNormMult * stop){
          std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
          break;
        }
        // Loosen the inner tolerance (by doubling) until norm * inner_tol^2 >= stop.
        // inner_tol is never reset, so it only grows across outer iterations.
        while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
        PrecChangeTimer.Start();
        precisionChange(src_f, src_d);
        PrecChangeTimer.Stop();
        zeroit(sol_f);
        //Optionally improve inner solver guess (eg using known eigenvectors)
        if(guesser != nullptr)
          (*guesser)(src_f, sol_f);
        //Inner CG
        CG_f.Tolerance = inner_tol;
        InnerCGtimer.Start();
        CG_f(Linop_f, src_f, sol_f);
        InnerCGtimer.Stop();
        //Convert sol back to double and add to double prec solution
        PrecChangeTimer.Start();
        precisionChange(tmp_d, sol_f);
        PrecChangeTimer.Stop();
        axpy(sol_d, 1.0, tmp_d, sol_d);
      }
      //Final trial CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
      CG_d(Linop_d, src_d_in, sol_d);
      TotalTimer.Stop();
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
    }
  };
 }
 #endif
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@ -130,8 +130,8 @@ DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st,
 }
-#include "Householder.h"
+#include <algorithms/iterative/Householder.h>
-#include "Francis.h"
+#include <algorithms/iterative/Francis.h>
 #endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -33,8 +33,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef USE_LAPACK
 #include <lapacke.h>
 #endif
-#include "DenseMatrix.h"
+#include <algorithms/iterative/DenseMatrix.h>
-#include "EigenSort.h"
+#include <algorithms/iterative/EigenSort.h>
 namespace Grid {
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@ -29,6 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_CARTESIAN_BASE_H
 #define GRID_CARTESIAN_BASE_H
 #include <Grid.h>
 namespace Grid{
@ -77,12 +78,15 @@ public:
    // GridCartesian / GridRedBlackCartesian
    ////////////////////////////////////////////////////////////////
    virtual int CheckerBoarded(int dim)=0;
-    virtual int CheckerBoard(std::vector<int> &site)=0;
+    virtual int CheckerBoard(std::vector<int> site)=0;
    virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
    virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
-    virtual int CheckerBoardFromOindex (int Oindex)=0;
+    int  CheckerBoardFromOindex (int Oindex){
-    virtual int CheckerBoardFromOindexTable (int Oindex)=0;
+      std::vector<int> ocoor;
      oCoorFromOindex(ocoor,Oindex); 
      return CheckerBoard(ocoor);
    }
    //////////////////////////////////////////////////////////////////////////////////////////////
    // Local layout calculations
@ -103,12 +107,6 @@ public:
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
    virtual int iIndex(std::vector<int> &lcoor)
    {
        int idx=0;
        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
        return idx;
    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
      int idx=0; 
@ -125,6 +123,12 @@ public:
    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
    inline int iIndex(std::vector<int> &lcoor)
    {
        int idx=0;
        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
        return idx;
    }
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
      Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
@ -216,7 +220,7 @@ public:
      }
      i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
-      o_idx= oIndex(lcoor);  // this implies divide by 2 on checkerdim
+      o_idx= oIndex(lcoor);// this implies divide by 2 on checkerdim
    }
    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
--- a/lib/cartesian/Cartesian_full.h
+++ b/lib/cartesian/Cartesian_full.h
@ -39,17 +39,10 @@ class GridCartesian: public GridBase {
 public:
    virtual int  CheckerBoardFromOindexTable (int Oindex) {
      return 0;
    }
    virtual int  CheckerBoardFromOindex (int Oindex)
    {
      return 0;
    }
    virtual int CheckerBoarded(int dim){
      return 0;
    }
-    virtual int CheckerBoard(std::vector<int> &site){
+    virtual int CheckerBoard(std::vector<int> site){
        return 0;
    }
    virtual int CheckerBoardDestination(int cb,int shift,int dim){
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@ -32,24 +32,29 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
-  static const int CbRed  =0;
+    static const int CbRed  =0;
-  static const int CbBlack=1;
+    static const int CbBlack=1;
-  static const int Even   =CbRed;
+    static const int Even   =CbRed;
-  static const int Odd    =CbBlack;
+    static const int Odd    =CbBlack;
-    
+
    // Perhaps these are misplaced and 
    // should be in sparse matrix.
    // Also should make these a named enum type
    static const int DaggerNo=0;
    static const int DaggerYes=1;
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
 public:
    std::vector<int> _checker_dim_mask;
    int              _checker_dim;
    std::vector<int> _checker_board;
    virtual int CheckerBoarded(int dim){
      if( dim==_checker_dim) return 1;
      else return 0;
    }
-    virtual int CheckerBoard(std::vector<int> &site){
+    virtual int CheckerBoard(std::vector<int> site){
      int linear=0;
      assert(site.size()==_ndimension);
      for(int d=0;d<_ndimension;d++){ 
@ -73,20 +78,12 @@ public:
      // or by looping over x,y,z and multiply rather than computing checkerboard.
      if ( (source_cb+ocb)&1 ) {
 	return (shift)/2;
      } else {
 	return (shift+1)/2;
      }
    }
    virtual int  CheckerBoardFromOindexTable (int Oindex) {
      return _checker_board[Oindex];
    }
    virtual int  CheckerBoardFromOindex (int Oindex)
    {
      std::vector<int> ocoor;
      oCoorFromOindex(ocoor,Oindex);
      return CheckerBoard(ocoor);
    }
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
      if(dim != _checker_dim) return shift;
@ -178,7 +175,7 @@ public:
 	// all elements of a simd vector must have same checkerboard.
 	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
 	if ( _simd_layout[d]>1 ) {
-	  if ( checker_dim_mask[d] ) { 
+	  if ( d != _checker_dim ) { 
 	    assert( (_rdimensions[d]&0x1) == 0 );
 	  }
 	}
@ -194,8 +191,6 @@ public:
 	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
 	  _istride[d] = _istride[d-1]*_simd_layout[d-1];
 	}
      }
      ////////////////////////////////////////////////////////////////////////////////////////////
@ -216,18 +211,6 @@ public:
 	_slice_nblock[d]=nblock;
 	block = block*_rdimensions[d];
      }
      ////////////////////////////////////////////////
      // Create a checkerboard lookup table
      ////////////////////////////////////////////////
      int rvol = 1;
      for(int d=0;d<_ndimension;d++){
 	rvol=rvol * _rdimensions[d];
      }
      _checker_board.resize(rvol);
      for(int osite=0;osite<_osites;osite++){
 	_checker_board[osite] = CheckerBoardFromOindex (osite);
      }
    };
 protected:
@ -241,21 +224,9 @@ protected:
 	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
 	}
      }
-      return idx;
+        return idx;
    };
    virtual int iIndex(std::vector<int> &lcoor)
    {
        int idx=0;
        for(int d=0;d<_ndimension;d++) {
 	  if( d==_checker_dim ) {
 	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
 	  } else { 
 	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
 	  }
 	}
        return idx;
    }
 };
 }
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@ -1,132 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_base.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 namespace Grid {
 ///////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////
 int CartesianCommunicator::ShmRank;
 int CartesianCommunicator::ShmSize;
 int CartesianCommunicator::GroupRank;
 int CartesianCommunicator::GroupSize;
 int CartesianCommunicator::WorldRank;
 int CartesianCommunicator::WorldSize;
 int CartesianCommunicator::Slave;
 void *              CartesianCommunicator::ShmCommBuf;
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
 // Hands out the next `bytes` bytes of the shared-memory comms buffer by
 // advancing heap_top; heap_bytes tracks the running total handed out.
 // The total is checked against MAX_MPI_SHM_BYTES with an assert only.
 void *CartesianCommunicator::ShmBufferMalloc(size_t bytes)
 {
   // Rounding the request up to an alignment boundary is currently disabled:
   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
   void *const allocation = reinterpret_cast<void *>(heap_top);
   heap_bytes += bytes;
   heap_top   += bytes;
   std::cout <<"Shm alloc "<< allocation << std::endl;
   assert(heap_bytes < MAX_MPI_SHM_BYTES);
   return allocation;
 }
 // Resets the allocator: the byte count returns to zero and heap_top is
 // rewound to the address reported by ShmBufferSelf().
 void CartesianCommunicator::ShmBufferFreeAll(void)
 {
   heap_bytes = 0;
   heap_top   = reinterpret_cast<size_t>(ShmBufferSelf());
 }
 /////////////////////////////////
 // Grid information queries
 /////////////////////////////////
 // Non-zero exactly when this rank's linear index (_processor) is 0.
 int CartesianCommunicator::IsBoss(void)
 {
   return _processor == 0;
 }
 // The boss is always linear rank 0.
 int CartesianCommunicator::BossRank(void)
 {
   return 0;
 }
 // Linear rank of this process within the processor grid.
 int CartesianCommunicator::ThisRank(void)
 {
   return _processor;
 }
 // Coordinate of this process within the processor grid.
 const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void)
 {
   return _processor_coor;
 }
 // Per-dimension extent of the processor grid.
 const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)
 {
   return _processors;
 }
 // Total number of processes in the grid.
 int CartesianCommunicator::ProcessorCount(void)
 {
   return _Nprocessors;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
 // Thin accessors over the static rank/size bookkeeping members.
 int CartesianCommunicator::RankWorld(void)
 {
   return WorldRank;
 }
 int CartesianCommunicator::Ranks(void)
 {
   return WorldSize;
 }
 int CartesianCommunicator::Nodes(void)
 {
   return GroupSize;
 }
 int CartesianCommunicator::Cores(void)
 {
   return ShmSize;
 }
 int CartesianCommunicator::NodeRank(void)
 {
   return GroupRank;
 }
 int CartesianCommunicator::CoreRank(void)
 {
   return ShmRank;
 }
 // Complex reductions are forwarded to the real-valued vector reductions by
 // viewing each complex number as two consecutive reals.
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
   float *reals = reinterpret_cast<float *>(&c);
   GlobalSumVector(reals, 2);
 }
 void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 {
   float *reals = reinterpret_cast<float *>(c);
   GlobalSumVector(reals, 2*N);
 }
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
   double *reals = reinterpret_cast<double *>(&c);
   GlobalSumVector(reals, 2);
 }
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
   double *reals = reinterpret_cast<double *>(c);
   GlobalSumVector(reals, 2*N);
 }
 #ifndef GRID_COMMS_MPI3
 // Compiled only when GRID_COMMS_MPI3 is not defined: the stencil variant is a
 // plain pass-through to SendToRecvFromBegin with the arguments unchanged.
 void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                        void *xmit, int xmit_to_rank,
                                                        void *recv, int recv_from_rank,
                                                        int bytes)
 {
   SendToRecvFromBegin(list, xmit, xmit_to_rank, recv, recv_from_rank, bytes);
 }
 // Compiled only when GRID_COMMS_MPI3 is not defined: completion of stencil
 // requests is delegated unchanged to SendToRecvFromComplete.
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
 {
   SendToRecvFromComplete(waitall);
 }
 // Compiled only when GRID_COMMS_MPI3 is not defined: does nothing.
 void CartesianCommunicator::StencilBarrier(void)
 {
 }
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
 // Non-MPI3 builds: this process's comms buffer is the single static ShmCommBuf.
 void *CartesianCommunicator::ShmBufferSelf(void)
 {
   return ShmCommBuf;
 }
 // Non-MPI3 builds: always returns NULL, whatever rank is asked for.
 void *CartesianCommunicator::ShmBuffer(int rank)
 {
   return NULL;
 }
 // Non-MPI3 builds: always returns NULL; both arguments are ignored.
 void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
 {
   return NULL;
 }
 // Sizes the static storage vector to MAX_MPI_SHM_BYTES and points ShmCommBuf
 // at its first element.
 void CartesianCommunicator::ShmInitGeneric(void)
 {
   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
   ShmCommBuf = static_cast<void *>(&ShmBufStorageVector[0]);
 }
 #endif
 }
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@ -34,194 +34,123 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
 #include <mpp/shmem.h>
 #endif
 namespace Grid {
 class CartesianCommunicator {
  public:    
  // 65536 ranks per node adequate for now
  // 128MB of shared memory for comms; enough for 48^4 local volume comms
  // Give external control (command line override?) of this
  static const int      MAXLOG2RANKSPERNODE = 16;            
  static const uint64_t MAX_MPI_SHM_BYTES   = 128*1024*1024; 
  // Communicator should know nothing of the physics grid, only processor grid.
  int              _Nprocessors;     // How many in all
  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
  std::vector<int> _processor_coor;  // linear processor coordinate
  unsigned long _ndimension;
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+    int              _Nprocessors;     // How many in all
-  MPI_Comm communicator;
+    std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
-  static MPI_Comm communicator_world;
+    int              _processor;       // linear processor rank
-  typedef MPI_Request CommsRequest_t;
+    std::vector<int> _processor_coor;  // linear processor coordinate
    unsigned long _ndimension;
 #ifdef GRID_COMMS_MPI
    MPI_Comm communicator;
    typedef MPI_Request CommsRequest_t;
 #else 
-  typedef int CommsRequest_t;
+    typedef int CommsRequest_t;
 #endif
-  ////////////////////////////////////////////////////////////////////
+    static void Init(int *argc, char ***argv);
  // Helper functionality for SHM Windows common to all other impls
  ////////////////////////////////////////////////////////////////////
  // Longer term; drop this in favour of a master / slave model with 
  // cartesian communicator on a subset of ranks, slave ranks controlled
  // by group leader with data xfer via shared memory
  ////////////////////////////////////////////////////////////////////
 #ifdef  GRID_COMMS_MPI3
  std::vector<int>  WorldDims;
  std::vector<int>  GroupDims;
  std::vector<int>  ShmDims;
  std::vector<int> GroupCoor;
  std::vector<int> ShmCoor;
  std::vector<int> WorldCoor;
  static std::vector<int> GroupRanks; 
  static std::vector<int> MyGroup;
  static int ShmSetup;
  static MPI_Win ShmWindow; 
  static MPI_Comm ShmComm;
  std::vector<int>  LexicographicToWorldRank;
  static std::vector<void *> ShmCommBufs;
 #else 
  static void ShmInitGeneric(void);
  static commVector<uint8_t> ShmBufStorageVector;
 #endif 
  static void * ShmCommBuf;
  size_t heap_top;
  size_t heap_bytes;
  void *ShmBufferSelf(void);
  void *ShmBuffer(int rank);
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes);
  void ShmBufferFreeAll(void) ;
  ////////////////////////////////////////////////
  // Must call in Grid startup
  ////////////////////////////////////////////////
  static void Init(int *argc, char ***argv);
  ////////////////////////////////////////////////
  // Constructor of any given grid
  ////////////////////////////////////////////////
  CartesianCommunicator(const std::vector<int> &pdimensions_in);
  ////////////////////////////////////////////////////////////////////////////////////////
  // Wraps MPI_Cart routines, or implements equivalent on other impls
  ////////////////////////////////////////////////////////////////////////////////////////
  void ShiftedRanks(int dim,int shift,int & source, int & dest);
  int  RankFromProcessorCoor(std::vector<int> &coor);
  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
  /////////////////////////////////
  // Grid information and queries
  /////////////////////////////////
  static int ShmRank;
  static int ShmSize;
  static int GroupSize;
  static int GroupRank;
  static int WorldRank;
  static int WorldSize;
  static int Slave;
  int                      IsBoss(void)            ;
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const std::vector<int> & ThisProcessorCoor(void) ;
  const std::vector<int> & ProcessorGrid(void)     ;
  int                      ProcessorCount(void)    ;
  static int Ranks    (void);
  static int Nodes    (void);
  static int Cores    (void);
  static int NodeRank (void);
  static int CoreRank (void);
-  ////////////////////////////////////////////////////////////////////////////////
+    // Constructor
-  // very VERY rarely (Log, serial RNG) we need world without a grid
+    CartesianCommunicator(const std::vector<int> &pdimensions_in);
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
  static void BroadcastWorld(int root,void* data, int bytes);
  ////////////////////////////////////////////////////////////
  // Reduction
  ////////////////////////////////////////////////////////////
  void GlobalSum(RealF &);
  void GlobalSumVector(RealF *,int N);
  void GlobalSum(RealD &);
  void GlobalSumVector(RealD *,int N);
  void GlobalSum(uint32_t &);
  void GlobalSum(uint64_t &);
  void GlobalSum(ComplexF &c);
  void GlobalSumVector(ComplexF *c,int N);
  void GlobalSum(ComplexD &c);
  void GlobalSumVector(ComplexD *c,int N);
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
    scalar_type * ptr = (scalar_type *)& o;
    GlobalSumVector(ptr,words);
  }
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
  void SendToRecvFrom(void *xmit,
 		      int xmit_to_rank,
 		      void *recv,
 		      int recv_from_rank,
 		      int bytes);
  void SendRecvPacket(void *xmit,
 		      void *recv,
 		      int xmit_to_rank,
 		      int recv_from_rank,
 		      int bytes);
  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			   void *xmit,
 			   int xmit_to_rank,
 			   void *recv,
 			   int recv_from_rank,
 			   int bytes);
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
-  void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+    // Wraps MPI_Cart routines
-				  void *xmit,
+    void ShiftedRanks(int dim,int shift,int & source, int & dest);
-				  int xmit_to_rank,
+    int  RankFromProcessorCoor(std::vector<int> &coor);
-				  void *recv,
+    void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
 				  int recv_from_rank,
 				  int bytes);
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
  void StencilBarrier(void);
-  ////////////////////////////////////////////////////////////
+    /////////////////////////////////
-  // Barrier
+    // Grid information queries
-  ////////////////////////////////////////////////////////////
+    /////////////////////////////////
-  void Barrier(void);
+    int                      IsBoss(void)            { return _processor==0; };
-  
+    int                      BossRank(void)          { return 0; };
-  ////////////////////////////////////////////////////////////
+    int                      ThisRank(void)          { return _processor; };
-  // Broadcast a buffer and composite larger
+    const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
-  ////////////////////////////////////////////////////////////
+    const std::vector<int> & ProcessorGrid(void)     { return _processors; };
-  void Broadcast(int root,void* data, int bytes);
+    int                      ProcessorCount(void)    { return _Nprocessors; };
-  
+
-  template<class obj> void Broadcast(int root,obj &data)
+    ////////////////////////////////////////////////////////////
    // Reduction
    ////////////////////////////////////////////////////////////
    void GlobalSum(RealF &);
    void GlobalSumVector(RealF *,int N);
    void GlobalSum(RealD &);
    void GlobalSumVector(RealD *,int N);
    void GlobalSum(uint32_t &);
    void GlobalSum(uint64_t &);
    void GlobalSum(ComplexF &c)
    {
      GlobalSumVector((float *)&c,2);
    }
    void GlobalSumVector(ComplexF *c,int N)
    {
      GlobalSumVector((float *)c,2*N);
    }
    void GlobalSum(ComplexD &c)
    {
      GlobalSumVector((double *)&c,2);
    }
    void GlobalSumVector(ComplexD *c,int N)
    {
      GlobalSumVector((double *)c,2*N);
    }
    template<class obj> void GlobalSum(obj &o){
      typedef typename obj::scalar_type scalar_type;
      int words = sizeof(obj)/sizeof(scalar_type);
      scalar_type * ptr = (scalar_type *)& o;
      GlobalSumVector(ptr,words);
    }
    ////////////////////////////////////////////////////////////
    // Face exchange, buffer swap in translational invariant way
    ////////////////////////////////////////////////////////////
    void SendToRecvFrom(void *xmit,
 			int xmit_to_rank,
 			void *recv,
 			int recv_from_rank,
 			int bytes);
    void SendRecvPacket(void *xmit,
 			void *recv,
 			int xmit_to_rank,
 			int recv_from_rank,
 			int bytes);
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
    ////////////////////////////////////////////////////////////
    // Barrier
    ////////////////////////////////////////////////////////////
    void Barrier(void);
    ////////////////////////////////////////////////////////////
    // Broadcast a buffer and composite larger
    ////////////////////////////////////////////////////////////
    void Broadcast(int root,void* data, int bytes);
    template<class obj> void Broadcast(int root,obj &data)
    {
      Broadcast(root,(void *)&data,sizeof(data));
    };
    static void BroadcastWorld(int root,void* data, int bytes);
 }; 
 }
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@ -30,30 +30,21 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
-
+  // Should error check all MPI calls.
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 MPI_Comm CartesianCommunicator::communicator_world;
 // Should error check all MPI calls.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  MPI_Comm_rank(communicator_world,&WorldRank);
  MPI_Comm_size(communicator_world,&WorldSize);
  ShmRank=0;
  ShmSize=1;
  GroupRank=WorldRank;
  GroupSize=WorldSize;
  Slave    =0;
  ShmInitGeneric();
 }
  int Rank(void) {
    int pe;
    MPI_Comm_rank(MPI_COMM_WORLD,&pe);
    return pe;
  }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _ndimension = processors.size();
@ -63,7 +54,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  _processors = processors;
  _processor_coor.resize(_ndimension);
-  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
+  MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@ -76,6 +67,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  assert(Size==_Nprocessors);
 }
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
@ -176,6 +168,7 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
  int nreq=list.size();
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
 }
@ -194,17 +187,14 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 		     communicator);
  assert(ierr==0);
 }
-  ///////////////////////////////////////////////////////
+
  // Should only be used prior to Grid Init finished.
  // Check for this?
  ///////////////////////////////////////////////////////
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
 		      root,
-		      communicator_world);
+		      MPI_COMM_WORLD);
  assert(ierr==0);
 }
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@ -1,574 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi3.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 #include <mpi.h>
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once (in Init) and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 int CartesianCommunicator::ShmSetup = 0;                  // set to 1 at the end of Init(), which asserts it was still 0
 MPI_Comm CartesianCommunicator::communicator_world;       // duplicate of MPI_COMM_WORLD created in Init()
 MPI_Comm CartesianCommunicator::ShmComm;                  // result of the MPI_COMM_TYPE_SHARED split in Init()
 MPI_Win  CartesianCommunicator::ShmWindow;                // window returned by MPI_Win_allocate_shared in Init()
 std::vector<int> CartesianCommunicator::GroupRanks;       // world rank -> rank in ShmComm, or MPI_UNDEFINED
 std::vector<int> CartesianCommunicator::MyGroup;          // sorted world ranks that have a defined GroupRanks entry
 std::vector<void *> CartesianCommunicator::ShmCommBufs;   // per ShmComm rank: pointer from MPI_Win_shared_query
 void *CartesianCommunicator::ShmBufferSelf(void)
 {
   // ShmCommBufs is indexed by rank within ShmComm; pick our own entry.
   void *own_segment = ShmCommBufs[ShmRank];
   return own_segment;
 }
 // Returns the shared-window pointer recorded for world rank `rank`, or NULL
 // when GroupRanks marks that rank as MPI_UNDEFINED (not in our ShmComm).
 void *CartesianCommunicator::ShmBuffer(int rank)
 {
   const int shm_peer = GroupRanks[rank];
   if (shm_peer != MPI_UNDEFINED) {
     return ShmCommBufs[shm_peer];
   }
   return NULL;
 }
 // Rebases `local_p` from our own shared segment onto the segment recorded for
 // world rank `rank`: the byte offset of local_p from ShmCommBufs[ShmRank] is
 // added to that rank's base pointer. Returns NULL when GroupRanks marks the
 // rank as MPI_UNDEFINED.
 void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
 {
   const int shm_peer = GroupRanks[rank];
   if (shm_peer == MPI_UNDEFINED) {
     return NULL;
   }
   const uint64_t own_base     = (uint64_t)ShmCommBufs[ShmRank];
   const uint64_t peer_base    = (uint64_t)ShmCommBufs[shm_peer];
   const uint64_t displacement = (uint64_t)local_p - own_base;
   return (void *)(peer_base + displacement);
 }
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  MPI_Comm_rank(communicator_world,&WorldRank);
  MPI_Comm_size(communicator_world,&WorldSize);
  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory
  /////////////////////////////////////////////////////////////////////
  MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
  MPI_Comm_rank(ShmComm     ,&ShmRank);
  MPI_Comm_size(ShmComm     ,&ShmSize);
  GroupSize = WorldSize/ShmSize;
  /////////////////////////////////////////////////////////////////////
  // find world ranks in our SHM group (i.e. which ranks are on our node)
  /////////////////////////////////////////////////////////////////////
  MPI_Group WorldGroup, ShmGroup;
  MPI_Comm_group (communicator_world, &WorldGroup); 
  MPI_Comm_group (ShmComm, &ShmGroup);
  std::vector<int> world_ranks(WorldSize); 
  GroupRanks.resize(WorldSize); 
  MyGroup.resize(ShmSize);
  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]); 
  ///////////////////////////////////////////////////////////////////
  // Identify who is in my group and noninate the leader
    ///////////////////////////////////////////////////////////////////
  int g=0;
  for(int rank=0;rank<WorldSize;rank++){
    if(GroupRanks[rank]!=MPI_UNDEFINED){
      assert(g<ShmSize);
      MyGroup[g++] = rank;
    }
  }
  std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
  int myleader = MyGroup[0];
  std::vector<int> leaders_1hot(WorldSize,0);
  std::vector<int> leaders_group(GroupSize,0);
  leaders_1hot [ myleader ] = 1;
  ///////////////////////////////////////////////////////////////////
  // global sum leaders over comm world
  ///////////////////////////////////////////////////////////////////
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
  assert(ierr==0);
  ///////////////////////////////////////////////////////////////////
  // find the group leaders world rank
  ///////////////////////////////////////////////////////////////////
  int group=0;
  for(int l=0;l<WorldSize;l++){
    if(leaders_1hot[l]){
      leaders_group[group++] = l;
    }
  }
  ///////////////////////////////////////////////////////////////////
  // Identify the rank of the group in which I (and my leader) live
  ///////////////////////////////////////////////////////////////////
  GroupRank=-1;
  for(int g=0;g<GroupSize;g++){
    if (myleader == leaders_group[g]){
      GroupRank=g;
    }
  }
  assert(GroupRank!=-1);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared window for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  ShmCommBuf = 0;
  ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
  assert(ierr==0);
  // KNL hack -- force to numa-domain 1 in flat
 #if 0
  //#include <numaif.h>
  for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
    void *pages = (void *) ( page + ShmCommBuf );
    int status;
    int flags=MPOL_MF_MOVE_ALL;
    int nodes=1; // numa domain == MCDRAM
    unsigned long count=1;
    ierr= move_pages(0,count, &pages,&nodes,&status,flags);
    if (ierr && (page==0)) perror("numa relocate command failed");
  }
 #endif
  MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  ShmCommBufs.resize(ShmSize);
  for(int r=0;r<ShmSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Verbose for now
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  if (WorldRank == 0){
    std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
    std::cout<< WorldSize << " Ranks " ;
    std::cout<< GroupSize << " Nodes " ;
    std::cout<<  ShmSize  << " with ranks-per-node "<<std::endl;
    std::cout<<GridLogMessage     <<"Grid MPI-3 configuration: allocated shared memory region of size ";
    std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
    for(int g=0;g<GroupSize;g++){
      std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
    }
    std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
    for(int g=0;g<ShmSize;g++){
      std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
      if(g!=ShmSize-1) std::cout<<",";
      else std::cout<<"}"<<std::endl;
    }
  }
  for(int g=0;g<GroupSize;g++){
    if ( (ShmRank == 0) && (GroupRank==g) )  std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
    for(int r=0;r<ShmSize;r++){
      if ( (ShmRank == 0) && (GroupRank==g) ) {
 	std::cout<<MyGroup[r];
 	if(r<ShmSize-1) std::cout<<",";
 	else std::cout<<"}"<<std::endl;
      }
      MPI_Barrier(communicator_world);
    }
  }
  assert(ShmSetup==0);  ShmSetup=1;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Want to implement some magic ... Group sub-cubes into those on same node
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // For a displacement `shift` along dimension `dim` (periodic wrap), returns
 // the world ranks sitting at +shift (`source`) and -shift (`dest`) from this
 // processor's coordinate. |shift| must be smaller than the extent of `dim`.
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   std::vector<int> neighbour = _processor_coor;
   assert(std::abs(shift) <_processors[dim]);

   // neighbour at +shift
   neighbour[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
   int lexico_plus;
   Lexicographic::IndexFromCoor(neighbour,lexico_plus,_processors);
   source = LexicographicToWorldRank[lexico_plus];

   // neighbour at -shift
   neighbour[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
   int lexico_minus;
   Lexicographic::IndexFromCoor(neighbour,lexico_minus,_processors);
   dest = LexicographicToWorldRank[lexico_minus];
 }
 // Maps a processor-grid coordinate to a world rank: the coordinate is first
 // turned into its lexicographic index, which is then looked up in
 // LexicographicToWorldRank.
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
   int lexico;
   Lexicographic::IndexFromCoor(coor,lexico,_processors);
   return LexicographicToWorldRank[lexico];
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
  Lexicographic::CoorFromIndex(coor,rank,_processors);
  rank = LexicographicToWorldRank[rank];
 }
 // Builds the processor grid described by `processors` (one extent per
 // dimension) on top of the state established by Init().
 //  - the product of the extents must equal WorldSize
 //  - ShmSize must be a power of two no larger than 2^MAXLOG2RANKSPERNODE
 //  - the ranks of one shared-memory group are laid out as a sub-block
 //    (ShmDims) of the grid, doubling one dimension at a time in rotation
 //  - every rank computes its own grid coordinate from (GroupRank, ShmRank)
 //    and the lexicographic-index -> world-rank table is assembled with a
 //    global sum, so this constructor is collective over communicator_world
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  int ierr;
  communicator=communicator_world;
  _ndimension = processors.size();
  ////////////////////////////////////////////////////////////////
  // Check processor counts match.
  // This runs before the sub-block search below: with too small a grid
  // (e.g. all extents 1 and ShmSize>1) that search found no dimension to
  // split and looped forever before this check was ever reached.
  ////////////////////////////////////////////////////////////////
  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);
  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }
  assert(WorldSize==_Nprocessors);
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = -1;
  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
    if ( (0x1<<i) == ShmSize ) {
      log2size = i;
      break;
    }
  }
  assert(log2size != -1);
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int dim = 0;
  std::vector<int> WorldDims = processors;
  ShmDims.resize(_ndimension,1);
  GroupDims.resize(_ndimension);
  ShmCoor.resize(_ndimension);
  GroupCoor.resize(_ndimension);
  WorldCoor.resize(_ndimension);
  for(int l2=0;l2<log2size;l2++){
    // Skip dimensions that cannot be halved again; if every dimension has
    // been rejected there is no valid sub-blocking, so fail rather than spin.
    int rejected = 0;
    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) {
      rejected++;
      assert(rejected < _ndimension);
      dim=(dim+1)%_ndimension;
    }
    ShmDims[dim]*=2;
    dim=(dim+1)%_ndimension;
  }
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<_ndimension;d++){
    GroupDims[d] = WorldDims[d]/ShmDims[d];
  }
  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  ////////////////////////////////////////////////////////////////
  LexicographicToWorldRank.resize(WorldSize,0);
  Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
  Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
  for(int d=0;d<_ndimension;d++){
    WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
  }
  _processor_coor = WorldCoor;
  int lexico;
  Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
  LexicographicToWorldRank[lexico]=WorldRank;
  // _processor has to be the world rank, not the lexicographic index: it is
  // used to index GroupRanks (asserted equal to ShmRank), it is compared with
  // and used as a message tag against ranks handed out by ShiftedRanks /
  // RankFromProcessorCoor, and all of those are world ranks.
  _processor = WorldRank;
  ///////////////////////////////////////////////////////////////////
  // global sum Lexico to World mapping
  // (each rank contributed only its own entry, all others are zero)
  ///////////////////////////////////////////////////////////////////
  ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 // Replaces `u` with the sum of `u` over every rank of `communicator`.
 void CartesianCommunicator::GlobalSum(uint32_t &u){
   const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(status==0);
 }
 // Replaces `u` with the sum of `u` over every rank of `communicator`.
 void CartesianCommunicator::GlobalSum(uint64_t &u){
   const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(status==0);
 }
 // Replaces `f` with the sum of `f` over every rank of `communicator`.
 void CartesianCommunicator::GlobalSum(float &f){
   const int status = MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(status==0);
 }
 // Element-wise in-place sum of the N floats at `f` over every rank of
 // `communicator`.
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
   assert(status==0);
 }
 // Replaces `d` with the sum of `d` over every rank of `communicator`.
 void CartesianCommunicator::GlobalSum(double &d)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
   assert(status==0);
 }
 // Element-wise in-place sum of the N doubles at `d` over every rank of
 // `communicator`.
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
   assert(status==0);
 }
 // Basic Halo comms primitive
 // Blocking exchange: posts the send of `bytes` bytes from `xmit` to `dest`
 // and the receive of `bytes` bytes into `recv` from `from`, then waits for
 // both to finish.
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int dest,
                                            void *recv,
                                            int from,
                                            int bytes)
 {
   std::vector<CommsRequest_t> pending;
   SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
   SendToRecvFromComplete(pending);
 }
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) { 
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
 }
 // Basic Halo comms primitive
 // Starts a non-blocking exchange: send `bytes` bytes from `xmit` to `dest` and
 // receive `bytes` bytes into `recv` from `from`. The request handles that were
 // posted are appended to `list`; pass it to SendToRecvFromComplete to wait.
 // Only the #else branch is compiled. The #if 0 branch is a disabled variant
 // that copies through the shared window for peers with a defined GroupRanks
 // entry and is kept for reference.
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
 #if 0
  // ---- disabled shared-memory path ----
  this->StencilBarrier();
  MPI_Request xrq;
  MPI_Request rrq;
  static int sequence;
  int ierr;
  int tag;
  int check;
  assert(dest != _processor);
  assert(from != _processor);
  int gdest = GroupRanks[dest];
  int gfrom = GroupRanks[from];
  int gme   = GroupRanks[_processor];
  sequence++;
  char *from_ptr = (char *)ShmCommBufs[ShmRank];
  int small = (bytes<MAX_MPI_SHM_BYTES);
  typedef uint64_t T;
  int words = bytes/sizeof(T);
  assert(((size_t)bytes &(sizeof(T)-1))==0);
  assert(gme == ShmRank);
  if ( small && (gdest !=MPI_UNDEFINED) ) {
    // peer shares our window: copy the payload into its segment, followed by
    // our rank and the sequence number as a trailer
    char *to_ptr   = (char *)ShmCommBufs[gdest];
    assert(gme != gdest);
    T *ip = (T *)xmit;
    T *op = (T *)to_ptr;
 PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
      op[w]=ip[w];
    }
    bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
    bcopy(&  sequence,&to_ptr[bytes+4],sizeof(sequence));
  } else { 
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
  }
  this->StencilBarrier();
  if (small && (gfrom !=MPI_UNDEFINED) ) {
    // payload was deposited in our own segment: copy it out and check the trailer
    T *ip = (T *)from_ptr;
    T *op = (T *)recv;
 PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
      op[w]=ip[w];
    }
    bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag));
    bcopy(&from_ptr[bytes+4],&check,sizeof(check));
    assert(check==sequence);
    assert(tag==from);
  } else { 
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    assert(ierr==0);
    list.push_back(rrq);
  }
  this->StencilBarrier();
 #else
  // ---- active path: plain MPI non-blocking send + receive ----
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor; // not used below
  int ierr;
  // send is tagged with our _processor; the receive expects tag == from
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
 #endif
 }
 void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						       void *xmit,
 						       int dest,
 						       void *recv,
 						       int from,
 						       int bytes)
 {
  MPI_Request xrq;
  MPI_Request rrq;
  int ierr;
  assert(dest != _processor);
  assert(from != _processor);
  int gdest = GroupRanks[dest];
  int gfrom = GroupRanks[from];
  int gme   = GroupRanks[_processor];
  assert(gme == ShmRank);
  if ( gdest == MPI_UNDEFINED ) {
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
  }
  if ( gfrom ==MPI_UNDEFINED) {
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    assert(ierr==0);
    list.push_back(rrq);
  }
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  SendToRecvFromComplete(list);
 }
 // Barrier over ShmComm, bracketed by MPI_Win_sync calls on the shared window.
 // NOTE(review): presumably the sync / barrier / sync sequence is there so
 // that stores made to the window before the barrier are visible to the other
 // ranks after it -- confirm against the MPI_Win_sync documentation.
 void CartesianCommunicator::StencilBarrier(void)
 {
  MPI_Win_sync (ShmWindow);   // before the rendezvous
  MPI_Barrier  (ShmComm);
  MPI_Win_sync (ShmWindow);   // after the rendezvous
 }
 // Blocks until every request in `list` has completed.
 // An empty list is a no-op: StencilSendToRecvFromBegin posts nothing for
 // peers with a defined GroupRanks entry, and taking &list[0] / &status[0] of
 // empty vectors is undefined behaviour.
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
 }
 // Barrier across every rank of `communicator`.
 void CartesianCommunicator::Barrier(void)
 {
   const int status = MPI_Barrier(communicator);
   assert(status==0);
 }
 // Broadcasts `bytes` raw bytes at `data` from rank `root` to every rank of
 // `communicator`.
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
   const int status = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator);
   assert(status==0);
 }
 // Broadcasts `bytes` raw bytes at `data` from rank `root` over
 // communicator_world (the duplicate of MPI_COMM_WORLD) rather than the
 // instance's communicator.
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
   const int status = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator_world);
   assert(status==0);
 }
 }
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@ -28,22 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include "Grid.h"
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Communicator-less build: describe a world that consists of exactly one rank.
 // The command-line arguments are not used.
 void CartesianCommunicator::Init(int *argc, char *** arv)
 {
   WorldSize = 1;
   WorldRank = 0;
   GroupSize = WorldSize;
   GroupRank = WorldRank;
   ShmSize   = 1;
   ShmRank   = 0;
   Slave     = 0;
   ShmInitGeneric();
 }
 // Without a communications layer the only rank is rank 0.
 int Rank(void)
 {
   return 0;
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _processors = processors;
@ -99,16 +89,30 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
  assert(0);
 }
-void CartesianCommunicator::Barrier(void){}
+void CartesianCommunicator::Barrier(void)
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
+{
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
+}
-int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;}
+
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);}
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
 }
 // No-op in the communicator-less build: `data` is left untouched.
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
 }
 // Communicator-less build: both neighbours are reported as rank 0, whatever
 // `dim` and `shift` are.
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   dest   = 0;
   source = 0;
 }
 // Communicator-less build: every coordinate maps to rank 0; `coor` is not
 // inspected.
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
   const int only_rank = 0;
   return only_rank;
 }
 // Communicator-less build: does nothing, so `coor` is returned unmodified.
 // NOTE(review): callers that expect `coor` to be filled in get whatever they
 // passed; confirm whether it should be set to this process's coordinate.
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
 }
 }
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@ -39,22 +39,17 @@ namespace Grid {
    BACKTRACEFILE();		   \
  }\
 }
-
+int Rank(void) {
-
+  return shmem_my_pe();
-///////////////////////////////////////////////////////////////////////////////////////////////////
+}
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Pair of 64-bit sequence counters kept for one connection.
 struct HandShake_t {
   uint64_t seq_local;   // counter advanced on this side
   uint64_t seq_remote;  // counter written on behalf of the other side
 };
 typedef struct HandShake_t HandShake;
 // File-local handshake tables. Init() sizes XConnections to shmem_n_pes()
 // and zeroes RConnections entries indexed by PE.
 // NOTE(review): presumably X = transmit side and R = receive side -- confirm.
 static Vector< HandShake > XConnections;
 static Vector< HandShake > RConnections;
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  shmem_init();
  XConnections.resize(shmem_n_pes());
@ -65,17 +60,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
    RConnections[pe].seq_local = 0;
    RConnections[pe].seq_remote= 0;
  }
  WorldSize = shmem_n_pes();
  WorldRank = shmem_my_pe();
  ShmRank=0;
  ShmSize=1;
  GroupRank=WorldRank;
  GroupSize=WorldSize;
  Slave    =0;
  shmem_barrier_all();
  ShmInitGeneric();
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _ndimension = processors.size();
@ -244,9 +230,12 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
  if ( _processor == sender ) {
    printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
    // Check he has posted a receive
    while(SendSeq->seq_remote == SendSeq->seq_local);
    printf("Sender receive %d posted\n",sender,receiver);
    // Advance our send count
    seq = ++(SendSeq->seq_local);
@ -255,19 +244,26 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
    shmem_putmem(recv,xmit,bytes,receiver);
    shmem_fence();
    printf("Sender sent payload %d\n",seq);
    //Notify him we're done
    shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
    shmem_fence();
    printf("Sender ringing door bell  %d\n",seq);
  }
  if ( _processor == receiver ) {
    printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
    // Post a receive
    seq = ++(RecvSeq->seq_local);
    shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
    printf("Receiver Opening letter box %d\n",seq);
    // Now wait until he has advanced our reception counter
    while(RecvSeq->seq_remote != RecvSeq->seq_local);
    printf("Receiver Got the mail %d\n",seq);
  }
 }
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@ -1,4 +1,3 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@ -45,7 +44,7 @@ public:
 // Gather for when there is no need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
 {
  int rd = rhs._grid->_rdimensions[dimension];
@ -57,7 +56,6 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimen
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int stride=rhs._grid->_slice_stride[dimension];
  if ( cbmask == 0x3 ) { 
 PARALLEL_NESTED_LOOP2
@ -70,20 +68,15 @@ PARALLEL_NESTED_LOOP2
    }
  } else { 
     int bo=0;
     std::vector<std::pair<int,int> > table;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	 int o  = n*stride;
-	 int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
+	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	 if ( ocb &cbmask ) {
-	   table.push_back(std::pair<int,int> (bo++,o+b));
+	   buffer[off+bo++]=compress(rhs._odata[so+o+b]);
 	 }
       }
     }
 PARALLEL_FOR_LOOP     
     for(int i=0;i<table.size();i++){
       buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
     }
  }
 }
@ -114,7 +107,6 @@ PARALLEL_NESTED_LOOP2
 	int o      =   n*n1;
 	int offset = b+n*n2;
 	cobj temp =compress(rhs._odata[so+o+b]);
 	extract<cobj>(temp,pointers,offset);
      }
@ -122,7 +114,6 @@ PARALLEL_NESTED_LOOP2
  } else { 
    assert(0); //Fixme think this is buggy
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o=n*rhs._grid->_slice_stride[dimension];
@ -141,7 +132,7 @@ PARALLEL_NESTED_LOOP2
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
 {
  SimpleCompressor<vobj> dontcompress;
  Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
@ -159,7 +150,7 @@ template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vec
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];
--- a/lib/cshift/Cshift_mpi.h
+++ b/lib/cshift/Cshift_mpi.h
@ -119,8 +119,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
-  commVector<vobj> send_buf(buffer_size);
+  std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
-  commVector<vobj> recv_buf(buffer_size);
+  std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@ -191,8 +191,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  int words = sizeof(vobj)/sizeof(vector_type);
-  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
+  std::vector<Vector<scalar_object> >   send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
-  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
+  std::vector<Vector<scalar_object> >   recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
  int bytes = buffer_size*sizeof(scalar_object);
--- a/lib/fftw/fftw3.h
+++ b/lib/fftw/fftw3.h
@ -1,412 +0,0 @@
 /*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * The following statement of license applies *only* to this header file,
 * and *not* to the other files distributed with FFTW or derived therefrom:
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /***************************** NOTE TO USERS *********************************
 *
 *                 THIS IS A HEADER FILE, NOT A MANUAL
 *
 *    If you want to know how to use FFTW, please read the manual,
 *    online at http://www.fftw.org/doc/ and also included with FFTW.
 *    For a quick start, see the manual's tutorial section.
 *
 *   (Reading header files to learn how to use a library is a habit
 *    stemming from code lacking a proper manual.  Arguably, it's a
 *    *bad* habit in most cases, because header files can contain
 *    interfaces that are not part of the public, stable API.)
 *
 ****************************************************************************/
 #ifndef FFTW3_H
 #define FFTW3_H
 #include <stdio.h>
 #ifdef __cplusplus
 extern "C"
 {
 #endif /* __cplusplus */
 /* If <complex.h> is included, use the C99 complex type.  Otherwise
   define a type bit-compatible with C99 complex */
 #if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
 #  define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
 #else
 #  define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
 #endif
 #define FFTW_CONCAT(prefix, name) prefix ## name
 #define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
 #define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
 #define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
 #define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
 /* IMPORTANT: for Windows compilers, you should add a line
        #define FFTW_DLL
   here and in kernel/ifftw.h if you are compiling/using FFTW as a
   DLL, in order to do the proper importing/exporting, or
   alternatively compile with -DFFTW_DLL or the equivalent
   command-line flag.  This is not necessary under MinGW/Cygwin, where
   libtool does the imports/exports automatically. */
 #if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
   /* annoying Windows syntax for shared-library declarations */
 #  if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
 #    define FFTW_EXTERN extern __declspec(dllexport) 
 #  else /* user is calling FFTW; import symbol */
 #    define FFTW_EXTERN extern __declspec(dllimport) 
 #  endif
 #else
 #  define FFTW_EXTERN extern
 #endif
 /*
  * Real-to-real transform kinds.  Every enumerator is given an explicit
  * value (0 through 10).  Each precision's API reaches this type through
  * the X(r2r_kind) typedef emitted by FFTW_DEFINE_API rather than by this
  * name directly.
  */
 enum fftw_r2r_kind_do_not_use_me {
     FFTW_R2HC    = 0,
     FFTW_HC2R    = 1,
     FFTW_DHT     = 2,
     FFTW_REDFT00 = 3,
     FFTW_REDFT01 = 4,
     FFTW_REDFT10 = 5,
     FFTW_REDFT11 = 6,
     FFTW_RODFT00 = 7,
     FFTW_RODFT01 = 8,
     FFTW_RODFT10 = 9,
     FFTW_RODFT11 = 10
 };
 /*
  * Description of one transform dimension using plain int fields.
  * FFTW_DEFINE_API aliases it per precision as X(iodim), which is the
  * element type of the dims / howmany_dims arrays in the plan_guru_*
  * prototypes.
  */
 struct fftw_iodim_do_not_use_me {
     /* dimension size */
     int n;
     /* input stride */
     int is;
     /* output stride */
     int os;
 };
 #include <stddef.h> /* for ptrdiff_t */
 /*
  * Description of one transform dimension using ptrdiff_t fields.
  * FFTW_DEFINE_API aliases it per precision as X(iodim64), which is the
  * element type of the dims / howmany_dims arrays in the plan_guru64_*
  * prototypes.
  */
 struct fftw_iodim64_do_not_use_me {
     /* dimension size */
     ptrdiff_t n;
     /* input stride */
     ptrdiff_t is;
     /* output stride */
     ptrdiff_t os;
 };
 /* Character-output callback: called with one character `c` and an untyped
    pointer.  Aliased per precision as X(write_char_func), the type of the
    `write_char` argument of X(export_wisdom), which also takes a `void *data`
    (presumably forwarded as the second argument -- confirm in the FFTW manual). */
 typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
 /* Character-input callback: called with an untyped pointer and returning an
    int.  Aliased per precision as X(read_char_func), the type of the
    `read_char` argument of X(import_wisdom).  NOTE(review): the return value is
    presumably the next character or an end-of-input marker -- confirm in the
    FFTW manual. */
 typedef int (*fftw_read_char_func_do_not_use_me)(void *);
 /*
  huge second-order macro that defines prototypes for all API
  functions.  We expand this macro for each supported precision
  X: name-mangling macro
  R: real data type
  C: complex data type
 */
 #define FFTW_DEFINE_API(X, R, C)					   \
 									   \
 FFTW_DEFINE_COMPLEX(R, C);						   \
 									   \
 typedef struct X(plan_s) *X(plan);					   \
 									   \
 typedef struct fftw_iodim_do_not_use_me X(iodim);			   \
 typedef struct fftw_iodim64_do_not_use_me X(iodim64);			   \
 									   \
 typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind);			   \
 									   \
 typedef fftw_write_char_func_do_not_use_me X(write_char_func);		   \
 typedef fftw_read_char_func_do_not_use_me X(read_char_func);		   \
 									   \
 FFTW_EXTERN void X(execute)(const X(plan) p);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n,			   \
 		    C *in, C *out, int sign, unsigned flags);		   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign,	   \
 		       unsigned flags);					   \
 FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1,			   \
 		       C *in, C *out, int sign, unsigned flags);	   \
 FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2,		   \
 		       C *in, C *out, int sign, unsigned flags);	   \
 									   \
 FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n,		   \
                         int howmany,					   \
                         C *in, const int *inembed,			   \
                         int istride, int idist,			   \
                         C *out, const int *onembed,			   \
                         int ostride, int odist,			   \
                         int sign, unsigned flags);			   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims,	   \
 			 int howmany_rank,				   \
 			 const X(iodim) *howmany_dims,			   \
 			 C *in, C *out,					   \
 			 int sign, unsigned flags);			   \
 FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
 			 int howmany_rank,				   \
 			 const X(iodim) *howmany_dims,			   \
 			 R *ri, R *ii, R *ro, R *io,			   \
 			 unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank,			   \
                         const X(iodim64) *dims,			   \
 			 int howmany_rank,				   \
 			 const X(iodim64) *howmany_dims,		   \
 			 C *in, C *out,					   \
 			 int sign, unsigned flags);			   \
 FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank,			   \
                         const X(iodim64) *dims,			   \
 			 int howmany_rank,				   \
 			 const X(iodim64) *howmany_dims,		   \
 			 R *ri, R *ii, R *ro, R *io,			   \
 			 unsigned flags);				   \
 									   \
 FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out);	   \
 FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii,	   \
                                      R *ro, R *io);			   \
 									   \
 FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n,	   \
                             int howmany,				   \
                             R *in, const int *inembed,			   \
                             int istride, int idist,			   \
                             C *out, const int *onembed,		   \
                             int ostride, int odist,			   \
                             unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n,		   \
                        R *in, C *out, unsigned flags);			   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
 FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1,			   \
 			   R *in, C *out, unsigned flags);		   \
 FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1,			   \
 			   int n2,					   \
 			   R *in, C *out, unsigned flags);		   \
 									   \
 									   \
 FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n,	   \
 			     int howmany,				   \
 			     C *in, const int *inembed,			   \
 			     int istride, int idist,			   \
 			     R *out, const int *onembed,		   \
 			     int ostride, int odist,			   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n,		   \
                        C *in, R *out, unsigned flags);			   \
 									   \
 FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
 FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1,			   \
 			   C *in, R *out, unsigned flags);		   \
 FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1,			   \
 			   int n2,					   \
 			   C *in, R *out, unsigned flags);		   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims,   \
 			     int howmany_rank,				   \
 			     const X(iodim) *howmany_dims,		   \
 			     R *in, C *out,				   \
 			     unsigned flags);				   \
 FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims,   \
 			     int howmany_rank,				   \
 			     const X(iodim) *howmany_dims,		   \
 			     C *in, R *out,				   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)(				   \
                             int rank, const X(iodim) *dims,		   \
 			     int howmany_rank,				   \
 			     const X(iodim) *howmany_dims,		   \
 			     R *in, R *ro, R *io,			   \
 			     unsigned flags);				   \
 FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)(				   \
                             int rank, const X(iodim) *dims,		   \
 			     int howmany_rank,				   \
 			     const X(iodim) *howmany_dims,		   \
 			     R *ri, R *ii, R *out,			   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank,			   \
                             const X(iodim64) *dims,			   \
 			     int howmany_rank,				   \
 			     const X(iodim64) *howmany_dims,		   \
 			     R *in, C *out,				   \
 			     unsigned flags);				   \
 FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank,			   \
                             const X(iodim64) *dims,			   \
 			     int howmany_rank,				   \
 			     const X(iodim64) *howmany_dims,		   \
 			     C *in, R *out,				   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)(			   \
                             int rank, const X(iodim64) *dims,		   \
 			     int howmany_rank,				   \
 			     const X(iodim64) *howmany_dims,		   \
 			     R *in, R *ro, R *io,			   \
 			     unsigned flags);				   \
 FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)(			   \
                             int rank, const X(iodim64) *dims,		   \
 			     int howmany_rank,				   \
 			     const X(iodim64) *howmany_dims,		   \
 			     R *ri, R *ii, R *out,			   \
 			     unsigned flags);				   \
 									   \
 FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out);	   \
 FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out);	   \
 									   \
 FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p,		   \
                                          R *in, R *ro, R *io);		   \
 FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p,		   \
                                          R *ri, R *ii, R *out);	   \
 									   \
 FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n,		   \
                         int howmany,					   \
                         R *in, const int *inembed,			   \
                         int istride, int idist,			   \
                         R *out, const int *onembed,			   \
                         int ostride, int odist,			   \
                         const X(r2r_kind) *kind, unsigned flags);	   \
 									   \
 FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out,	   \
                    const X(r2r_kind) *kind, unsigned flags);		   \
 									   \
 FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out,		   \
                       X(r2r_kind) kind, unsigned flags);		   \
 FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out,	   \
                       X(r2r_kind) kind0, X(r2r_kind) kind1,		   \
                       unsigned flags);					   \
 FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2,		   \
                       R *in, R *out, X(r2r_kind) kind0,		   \
                       X(r2r_kind) kind1, X(r2r_kind) kind2,		   \
                       unsigned flags);					   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims,	   \
                         int howmany_rank,				   \
                         const X(iodim) *howmany_dims,			   \
                         R *in, R *out,					   \
                         const X(r2r_kind) *kind, unsigned flags);	   \
 									   \
 FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims,   \
                         int howmany_rank,				   \
                         const X(iodim64) *howmany_dims,		   \
                         R *in, R *out,					   \
                         const X(r2r_kind) *kind, unsigned flags);	   \
 									   \
 FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out);	   \
 									   \
 FFTW_EXTERN void X(destroy_plan)(X(plan) p);				   \
 FFTW_EXTERN void X(forget_wisdom)(void);				   \
 FFTW_EXTERN void X(cleanup)(void);					   \
 									   \
 FFTW_EXTERN void X(set_timelimit)(double t);				   \
 									   \
 FFTW_EXTERN void X(plan_with_nthreads)(int nthreads);			   \
 FFTW_EXTERN int X(init_threads)(void);					   \
 FFTW_EXTERN void X(cleanup_threads)(void);				   \
 									   \
 FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename);	   \
 FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file);		   \
 FFTW_EXTERN char *X(export_wisdom_to_string)(void);			   \
 FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char,   	   \
                                  void *data);				   \
 FFTW_EXTERN int X(import_system_wisdom)(void);				   \
 FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename);	   \
 FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file);		   \
 FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string);	   \
 FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
 									   \
 FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file);	   \
 FFTW_EXTERN void X(print_plan)(const X(plan) p);			   \
 FFTW_EXTERN char *X(sprint_plan)(const X(plan) p);			   \
 									   \
 FFTW_EXTERN void *X(malloc)(size_t n);					   \
 FFTW_EXTERN R *X(alloc_real)(size_t n);					   \
 FFTW_EXTERN C *X(alloc_complex)(size_t n);				   \
 FFTW_EXTERN void X(free)(void *p);					   \
 									   \
 FFTW_EXTERN void X(flops)(const X(plan) p,				   \
                          double *add, double *mul, double *fmas);	   \
 FFTW_EXTERN double X(estimate_cost)(const X(plan) p);			   \
 FFTW_EXTERN double X(cost)(const X(plan) p);				   \
 									   \
 FFTW_EXTERN int X(alignment_of)(R *p);                                     \
 FFTW_EXTERN const char X(version)[];                                       \
 FFTW_EXTERN const char X(cc)[];						   \
 FFTW_EXTERN const char X(codelet_optim)[];
 /* end of FFTW_DEFINE_API macro */
 FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
 FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
 FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
 /* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
   for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
 #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
 && !(defined(__ICC) || defined(__INTEL_COMPILER)) \
 && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
 #  if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
 /* note: __float128 is a typedef, which is not supported with the _Complex
         keyword in gcc, so instead we use this ugly __attribute__ version.
         However, we can't simply pass the __attribute__ version to
         FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
         types.  Hence redefining FFTW_DEFINE_COMPLEX.  Ugh. */
 #    undef FFTW_DEFINE_COMPLEX
 #    define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
 #  endif
 FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
 #endif
 #define FFTW_FORWARD (-1)
 #define FFTW_BACKWARD (+1)
 #define FFTW_NO_TIMELIMIT (-1.0)
 /* documented flags */
 #define FFTW_MEASURE (0U)
 #define FFTW_DESTROY_INPUT (1U << 0)
 #define FFTW_UNALIGNED (1U << 1)
 #define FFTW_CONSERVE_MEMORY (1U << 2)
 #define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
 #define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
 #define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
 #define FFTW_ESTIMATE (1U << 6)
 #define FFTW_WISDOM_ONLY (1U << 21)
 /* undocumented beyond-guru flags */
 #define FFTW_ESTIMATE_PATIENT (1U << 7)
 #define FFTW_BELIEVE_PCOST (1U << 8)
 #define FFTW_NO_DFT_R2HC (1U << 9)
 #define FFTW_NO_NONTHREADED (1U << 10)
 #define FFTW_NO_BUFFERING (1U << 11)
 #define FFTW_NO_INDIRECT_OP (1U << 12)
 #define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
 #define FFTW_NO_RANK_SPLITS (1U << 14)
 #define FFTW_NO_VRANK_SPLITS (1U << 15)
 #define FFTW_NO_VRECURSE (1U << 16)
 #define FFTW_NO_SIMD (1U << 17)
 #define FFTW_NO_SLOW (1U << 18)
 #define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
 #define FFTW_ALLOW_PRUNING (1U << 20)
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif /* __cplusplus */
 #endif /* FFTW3_H */
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@ -1,74 +1,73 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/lattice/Lattice_ET.h
+    Source file: ./lib/lattice/Lattice_ET.h
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
 /*  END LEGAL */
 #ifndef GRID_LATTICE_ET_H
 #define GRID_LATTICE_ET_H
 #include <iostream>
 #include <vector>
 #include <tuple>
 #include <typeinfo>
 #include <vector>
 namespace Grid {
-////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////
-// Predicated where support
+  // Predicated where support
-////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////
-template <class iobj, class vobj, class robj>
+  template<class iobj,class vobj,class robj>
-inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+    inline vobj predicatedWhere(const iobj &predicate,const vobj &iftrue,const robj &iffalse) {
                            const robj &iffalse) {
  typename std::remove_const<vobj>::type ret;
-  typedef typename vobj::scalar_object scalar_object;
+    typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
-  const int Nsimd = vobj::vector_type::Nsimd();
+    typedef typename vobj::scalar_object scalar_object;
-  const int words = sizeof(vobj) / sizeof(vector_type);
+    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
-  std::vector<Integer> mask(Nsimd);
+    const int Nsimd = vobj::vector_type::Nsimd();
-  std::vector<scalar_object> truevals(Nsimd);
+    const int words = sizeof(vobj)/sizeof(vector_type);
  std::vector<scalar_object> falsevals(Nsimd);
-  extract(iftrue, truevals);
+    std::vector<Integer> mask(Nsimd);
-  extract(iffalse, falsevals);
+    std::vector<scalar_object> truevals (Nsimd);
-  extract<vInteger, Integer>(TensorRemove(predicate), mask);
+    std::vector<scalar_object> falsevals(Nsimd);
-  for (int s = 0; s < Nsimd; s++) {
+    extract(iftrue   ,truevals);
-    if (mask[s]) falsevals[s] = truevals[s];
+    extract(iffalse  ,falsevals);
    extract<vInteger,Integer>(TensorRemove(predicate),mask);
    for(int s=0;s<Nsimd;s++){
      if (mask[s]) falsevals[s]=truevals[s];
    }
    merge(ret,falsevals);
    return ret;
  }
  merge(ret, falsevals);
  return ret;
 }
 ////////////////////////////////////////////
 // recursive evaluation of expressions; Could
 // switch to generic approach with variadics, a la
@ -76,351 +75,303 @@ inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
 // from tuple is hideous; C++14 introduces std::make_index_sequence for this
 ////////////////////////////////////////////
 // leaf eval of lattice ; should enable if protect using traits
-template <typename T>
+//leaf eval of lattice ; should enable if protect using traits
 using is_lattice = std::is_base_of<LatticeBase, T>;
-template <typename T>
+template <typename T> using is_lattice      = std::is_base_of<LatticeBase,T >;
 using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
 template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
 //Specialization of getVectorType for lattices
 // For a Lattice<Obj>, getVectorType<...>::type is the lattice's own
 // vector_object member type.
 template <typename Obj>
 struct getVectorType<Lattice<Obj> > {
   using type = typename Lattice<Obj>::vector_object;
 };
 // Generic leaf of the expression evaluator: the operand is returned by
 // value exactly as passed in, and the site index `ss` plays no part in
 // the result.  Lattice<lobj> operands are handled by a separate overload.
 template <class Operand>
 inline Operand eval(const unsigned int ss, const Operand &arg) {
   (void)ss;  // unused for this overload
   return arg;
 }
-template <class lobj>
+template<class lobj>
-inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
+inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg)
-  return arg._odata[ss];
+{
    return arg._odata[ss];
 }
 // handle nodes in syntax tree
 template <typename Op, typename T1>
-auto inline eval(
+auto inline eval(const unsigned int ss, const LatticeUnaryExpression<Op,T1 > &expr) // eval one operand
-    const unsigned int ss,
+  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second))))
-    const LatticeUnaryExpression<Op, T1> &expr)  // eval one operand
+{
-    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
+  return expr.first.func(eval(ss,std::get<0>(expr.second)));
  return expr.first.func(eval(ss, std::get<0>(expr.second)));
 }
 template <typename Op, typename T1, typename T2>
-auto inline eval(
+auto inline eval(const unsigned int ss, const LatticeBinaryExpression<Op,T1,T2> &expr) // eval two operands
-    const unsigned int ss,
+  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second))))
-    const LatticeBinaryExpression<Op, T1, T2> &expr)  // eval two operands
+{
-    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)));
                                eval(ss, std::get<1>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)),
                         eval(ss, std::get<1>(expr.second)));
 }
 template <typename Op, typename T1, typename T2, typename T3>
-auto inline eval(const unsigned int ss,
+auto inline eval(const unsigned int ss, const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) // eval three operands
-                 const LatticeTrinaryExpression<Op, T1, T2, T3>
+  -> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second))))
-                     &expr)  // eval three operands
+{
-    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+  return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second)) );
                                eval(ss, std::get<1>(expr.second)),
                                eval(ss, std::get<2>(expr.second)))) {
  return expr.first.func(eval(ss, std::get<0>(expr.second)),
                         eval(ss, std::get<1>(expr.second)),
                         eval(ss, std::get<2>(expr.second)));
 }
 //////////////////////////////////////////////////////////////////////////
-// Obtain the grid from an expression, ensuring conformable. This must follow a
+// Obtain the grid from an expression, ensuring conformable. This must follow a tree recursion
 // tree recursion
 //////////////////////////////////////////////////////////////////////////
-template <class T1,
+template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
-          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void GridFromExpression(GridBase * &grid,const T1& lat)   // Lattice leaf
-inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf
+{
  if ( grid ) {
    conformable(grid,lat._grid);
  } 
  grid=lat._grid;
 }
 template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
 inline void GridFromExpression(GridBase * &grid,const T1& notlat)   // non-lattice leaf
 {
  if (grid) {
    conformable(grid, lat._grid);
  }
  grid = lat._grid;
 }
 template <class T1,
          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
 inline void GridFromExpression(GridBase *&grid,
                               const T1 &notlat)  // non-lattice leaf
 {}
 template <typename Op, typename T1>
-inline void GridFromExpression(GridBase *&grid,
+inline void GridFromExpression(GridBase * &grid,const LatticeUnaryExpression<Op,T1 > &expr)
-                               const LatticeUnaryExpression<Op, T1> &expr) {
+{
-  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+  GridFromExpression(grid,std::get<0>(expr.second));// recurse 
 }
 template <typename Op, typename T1, typename T2>
-inline void GridFromExpression(
+inline void GridFromExpression(GridBase * &grid,const LatticeBinaryExpression<Op,T1,T2> &expr) 
-    GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
+{
-  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+  GridFromExpression(grid,std::get<0>(expr.second));// recurse
-  GridFromExpression(grid, std::get<1>(expr.second));
+  GridFromExpression(grid,std::get<1>(expr.second));
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void GridFromExpression(
+inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
-    GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+{
-  GridFromExpression(grid, std::get<0>(expr.second));  // recurse
+  GridFromExpression(grid,std::get<0>(expr.second));// recurse
-  GridFromExpression(grid, std::get<1>(expr.second));
+  GridFromExpression(grid,std::get<1>(expr.second));
-  GridFromExpression(grid, std::get<2>(expr.second));
+  GridFromExpression(grid,std::get<2>(expr.second));
 }
 //////////////////////////////////////////////////////////////////////////
-// Obtain the CB from an expression, ensuring conformable. This must follow a
+// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion
 // tree recursion
 //////////////////////////////////////////////////////////////////////////
-template <class T1,
+template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
-          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
 inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
 {
-  if ((cb == Odd) || (cb == Even)) {
+  if ( (cb==Odd) || (cb==Even) ) {
-    assert(cb == lat.checkerboard);
+    assert(cb==lat.checkerboard);
-  }
+  } 
-  cb = lat.checkerboard;
+  cb=lat.checkerboard;
  //  std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
 }
-template <class T1,
+template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
-          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
 inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf
 {
  //  std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
 }
 template <typename Op, typename T1>
-inline void CBFromExpression(int &cb,
+inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
-                             const LatticeUnaryExpression<Op, T1> &expr) {
+{
-  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
  //  std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2>
-inline void CBFromExpression(int &cb,
+inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr) 
-                             const LatticeBinaryExpression<Op, T1, T2> &expr) {
+{
-  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  CBFromExpression(cb,std::get<0>(expr.second));// recurse
-  CBFromExpression(cb, std::get<1>(expr.second));
+  CBFromExpression(cb,std::get<1>(expr.second));
  //  std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
 }
 template <typename Op, typename T1, typename T2, typename T3>
-inline void CBFromExpression(
+inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
-    int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+{
-  CBFromExpression(cb, std::get<0>(expr.second));  // recurse
+  CBFromExpression(cb,std::get<0>(expr.second));// recurse
-  CBFromExpression(cb, std::get<1>(expr.second));
+  CBFromExpression(cb,std::get<1>(expr.second));
-  CBFromExpression(cb, std::get<2>(expr.second));
+  CBFromExpression(cb,std::get<2>(expr.second));
  //  std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
 }
 ////////////////////////////////////////////
 // Unary operators and funcs
 ////////////////////////////////////////////
-#define GridUnopClass(name, ret)                                          \
+#define GridUnopClass(name,ret)\
-  template <class arg>                                                    \
+template <class arg> struct name\
-  struct name {                                                           \
+{\
-    static auto inline func(const arg a) -> decltype(ret) { return ret; } \
+  static auto inline func(const arg a)-> decltype(ret) { return ret; } \
-  };
+};
-GridUnopClass(UnarySub, -a);
+GridUnopClass(UnarySub,-a);
-GridUnopClass(UnaryNot, Not(a));
+GridUnopClass(UnaryNot,Not(a));
-GridUnopClass(UnaryAdj, adj(a));
+GridUnopClass(UnaryAdj,adj(a));
-GridUnopClass(UnaryConj, conjugate(a));
+GridUnopClass(UnaryConj,conjugate(a));
-GridUnopClass(UnaryTrace, trace(a));
+GridUnopClass(UnaryTrace,trace(a));
-GridUnopClass(UnaryTranspose, transpose(a));
+GridUnopClass(UnaryTranspose,transpose(a));
-GridUnopClass(UnaryTa, Ta(a));
+GridUnopClass(UnaryTa,Ta(a));
-GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
+GridUnopClass(UnaryProjectOnGroup,ProjectOnGroup(a));
-GridUnopClass(UnaryReal, real(a));
+GridUnopClass(UnaryReal,real(a));
-GridUnopClass(UnaryImag, imag(a));
+GridUnopClass(UnaryImag,imag(a));
-GridUnopClass(UnaryToReal, toReal(a));
+GridUnopClass(UnaryToReal,toReal(a));
-GridUnopClass(UnaryToComplex, toComplex(a));
+GridUnopClass(UnaryToComplex,toComplex(a));
-GridUnopClass(UnaryTimesI, timesI(a));
+GridUnopClass(UnaryAbs,abs(a));
-GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
+GridUnopClass(UnarySqrt,sqrt(a));
-GridUnopClass(UnaryAbs, abs(a));
+GridUnopClass(UnaryRsqrt,rsqrt(a));
-GridUnopClass(UnarySqrt, sqrt(a));
+GridUnopClass(UnarySin,sin(a));
-GridUnopClass(UnaryRsqrt, rsqrt(a));
+GridUnopClass(UnaryCos,cos(a));
-GridUnopClass(UnarySin, sin(a));
+GridUnopClass(UnaryLog,log(a));
-GridUnopClass(UnaryCos, cos(a));
+GridUnopClass(UnaryExp,exp(a));
 GridUnopClass(UnaryAsin, asin(a));
 GridUnopClass(UnaryAcos, acos(a));
 GridUnopClass(UnaryLog, log(a));
 GridUnopClass(UnaryExp, exp(a));
 ////////////////////////////////////////////
 // Binary operators
 ////////////////////////////////////////////
-#define GridBinOpClass(name, combination)                      \
+#define GridBinOpClass(name,combination)\
-  template <class left, class right>                           \
+template <class left,class right>\
-  struct name {                                                \
+struct name\
-    static auto inline func(const left &lhs, const right &rhs) \
+{\
-        -> decltype(combination) const {                       \
+  static auto inline func(const left &lhs,const right &rhs)-> decltype(combination) const \
-      return combination;                                      \
+    {\
-    }                                                          \
+      return combination;\
-  }
+    }\
-GridBinOpClass(BinaryAdd, lhs + rhs);
+}
-GridBinOpClass(BinarySub, lhs - rhs);
+GridBinOpClass(BinaryAdd,lhs+rhs);
-GridBinOpClass(BinaryMul, lhs *rhs);
+GridBinOpClass(BinarySub,lhs-rhs);
 GridBinOpClass(BinaryMul,lhs*rhs);
-GridBinOpClass(BinaryAnd, lhs &rhs);
+GridBinOpClass(BinaryAnd   ,lhs&rhs);
-GridBinOpClass(BinaryOr, lhs | rhs);
+GridBinOpClass(BinaryOr    ,lhs|rhs);
-GridBinOpClass(BinaryAndAnd, lhs &&rhs);
+GridBinOpClass(BinaryAndAnd,lhs&&rhs);
-GridBinOpClass(BinaryOrOr, lhs || rhs);
+GridBinOpClass(BinaryOrOr  ,lhs||rhs);
 ////////////////////////////////////////////////////
 // Trinary conditional op
 ////////////////////////////////////////////////////
-#define GridTrinOpClass(name, combination)                                     \
+#define GridTrinOpClass(name,combination)\
-  template <class predicate, class left, class right>                          \
+template <class predicate,class left, class right>	\
-  struct name {                                                                \
+struct name\
-    static auto inline func(const predicate &pred, const left &lhs,            \
+{\
-                            const right &rhs) -> decltype(combination) const { \
+  static auto inline func(const predicate &pred,const left &lhs,const right &rhs)-> decltype(combination) const \
-      return combination;                                                      \
+    {\
-    }                                                                          \
+      return combination;\
-  }
+    }\
 }
-GridTrinOpClass(
+GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \
-    TrinaryWhere,
+			       typename std::remove_reference<left>::type, \
-    (predicatedWhere<predicate, typename std::remove_reference<left>::type,
+			       typename std::remove_reference<right>::type> (pred,lhs,rhs)));
                     typename std::remove_reference<right>::type>(pred, lhs,
                                                                  rhs)));
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
 #define GRID_UNOP(name)   name<decltype(eval(0, arg))>
 #define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
 #define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
-#define GRID_UNOP(name) name<decltype(eval(0, arg))>
+#define GRID_DEF_UNOP(op, name)\
-#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+template <typename T1,\
-#define GRID_TRINOP(name) \
+  typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr> inline auto op(const T1 &arg) \
-  name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+  -> decltype(LatticeUnaryExpression<GRID_UNOP(name),const T1&>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg)))) \
 { return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg))); }
-#define GRID_DEF_UNOP(op, name)                                             \
+#define GRID_BINOP_LEFT(op, name)\
-  template <typename T1,                                                    \
+template <typename T1,typename T2,\
-            typename std::enable_if<is_lattice<T1>::value ||                \
+          typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr>\
-                                        is_lattice_expr<T1>::value,         \
+inline auto op(const T1 &lhs,const T2&rhs) \
-                                    T1>::type * = nullptr>                  \
+  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-  inline auto op(const T1 &arg)                                             \
+											    std::forward_as_tuple(lhs, rhs)))) \
-      ->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(       \
+{\
-          std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
+ return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-    return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(             \
+									  std::forward_as_tuple(lhs, rhs))); \
-        std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)));     \
+}
  }
-#define GRID_BINOP_LEFT(op, name)                                             \
+#define GRID_BINOP_RIGHT(op, name)\
-  template <typename T1, typename T2,                                         \
+ template <typename T1,typename T2,\
-            typename std::enable_if<is_lattice<T1>::value ||                  \
+           typename std::enable_if<!is_lattice<T1>::value && !is_lattice_expr<T1>::value, T1>::type* = nullptr,\
-                                        is_lattice_expr<T1>::value,           \
+           typename std::enable_if< is_lattice<T2>::value ||  is_lattice_expr<T2>::value, T2>::type* = nullptr> \
-                                    T1>::type * = nullptr>                    \
+inline auto op(const T1 &lhs,const T2&rhs)			\
-  inline auto op(const T1 &lhs, const T2 &rhs)                                \
+  -> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-      ->decltype(                                                             \
+											    std::forward_as_tuple(lhs, rhs)))) \
-          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
+{\
-              std::make_pair(GRID_BINOP(name)(),                              \
+ return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\
-                             std::forward_as_tuple(lhs, rhs)))) {             \
+								          std::forward_as_tuple(lhs, rhs))); \
-    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
+}
        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
  }
-#define GRID_BINOP_RIGHT(op, name)                                            \
+#define GRID_DEF_BINOP(op, name)\
-  template <typename T1, typename T2,                                         \
+ GRID_BINOP_LEFT(op,name);\
-            typename std::enable_if<!is_lattice<T1>::value &&                 \
+ GRID_BINOP_RIGHT(op,name);
                                        !is_lattice_expr<T1>::value,          \
                                    T1>::type * = nullptr,                    \
            typename std::enable_if<is_lattice<T2>::value ||                  \
                                        is_lattice_expr<T2>::value,           \
                                    T2>::type * = nullptr>                    \
  inline auto op(const T1 &lhs, const T2 &rhs)                                \
      ->decltype(                                                             \
          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
              std::make_pair(GRID_BINOP(name)(),                              \
                             std::forward_as_tuple(lhs, rhs)))) {             \
    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
  }
 #define GRID_DEF_BINOP(op, name) \
  GRID_BINOP_LEFT(op, name);     \
  GRID_BINOP_RIGHT(op, name);
-#define GRID_DEF_TRINOP(op, name)                                              \
+#define GRID_DEF_TRINOP(op, name)\
-  template <typename T1, typename T2, typename T3>                             \
+template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,const T2&lhs,const T3 &rhs) \
-  inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)                 \
+  -> decltype(LatticeTrinaryExpression<GRID_TRINOP(name),const T1&,const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(),\
-      ->decltype(                                                              \
+										   std::forward_as_tuple(pred,lhs,rhs)))) \
-          LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,  \
+{\
-                                   const T3 &>(std::make_pair(                 \
+  return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(), \
-              GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) {  \
+										 std::forward_as_tuple(pred,lhs, rhs))); \
-    return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
+}
                                    const T3 &>(std::make_pair(                \
        GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)));          \
  }
 ////////////////////////
-// Operator definitions
+//Operator definitions
 ////////////////////////
-GRID_DEF_UNOP(operator-, UnarySub);
+GRID_DEF_UNOP(operator -,UnarySub);
-GRID_DEF_UNOP(Not, UnaryNot);
+GRID_DEF_UNOP(Not,UnaryNot);
-GRID_DEF_UNOP(operator!, UnaryNot);
+GRID_DEF_UNOP(operator !,UnaryNot);
-GRID_DEF_UNOP(adj, UnaryAdj);
+GRID_DEF_UNOP(adj,UnaryAdj);
-GRID_DEF_UNOP(conjugate, UnaryConj);
+GRID_DEF_UNOP(conjugate,UnaryConj);
-GRID_DEF_UNOP(trace, UnaryTrace);
+GRID_DEF_UNOP(trace,UnaryTrace);
-GRID_DEF_UNOP(transpose, UnaryTranspose);
+GRID_DEF_UNOP(transpose,UnaryTranspose);
-GRID_DEF_UNOP(Ta, UnaryTa);
+GRID_DEF_UNOP(Ta,UnaryTa);
-GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
+GRID_DEF_UNOP(ProjectOnGroup,UnaryProjectOnGroup);
-GRID_DEF_UNOP(real, UnaryReal);
+GRID_DEF_UNOP(real,UnaryReal);
-GRID_DEF_UNOP(imag, UnaryImag);
+GRID_DEF_UNOP(imag,UnaryImag);
-GRID_DEF_UNOP(toReal, UnaryToReal);
+GRID_DEF_UNOP(toReal,UnaryToReal);
-GRID_DEF_UNOP(toComplex, UnaryToComplex);
+GRID_DEF_UNOP(toComplex,UnaryToComplex);
-GRID_DEF_UNOP(timesI, UnaryTimesI);
+GRID_DEF_UNOP(abs  ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing
-GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
+GRID_DEF_UNOP(sqrt ,UnarySqrt);
-GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
+GRID_DEF_UNOP(rsqrt,UnaryRsqrt);
-                               // abs-fabs-dabs-labs thing
+GRID_DEF_UNOP(sin  ,UnarySin);
-GRID_DEF_UNOP(sqrt, UnarySqrt);
+GRID_DEF_UNOP(cos  ,UnaryCos);
-GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
+GRID_DEF_UNOP(log  ,UnaryLog);
-GRID_DEF_UNOP(sin, UnarySin);
+GRID_DEF_UNOP(exp  ,UnaryExp);
 GRID_DEF_UNOP(cos, UnaryCos);
 GRID_DEF_UNOP(asin, UnaryAsin);
 GRID_DEF_UNOP(acos, UnaryAcos);
 GRID_DEF_UNOP(log, UnaryLog);
 GRID_DEF_UNOP(exp, UnaryExp);
-GRID_DEF_BINOP(operator+, BinaryAdd);
+GRID_DEF_BINOP(operator+,BinaryAdd);
-GRID_DEF_BINOP(operator-, BinarySub);
+GRID_DEF_BINOP(operator-,BinarySub);
-GRID_DEF_BINOP(operator*, BinaryMul);
+GRID_DEF_BINOP(operator*,BinaryMul);
-GRID_DEF_BINOP(operator&, BinaryAnd);
+GRID_DEF_BINOP(operator&,BinaryAnd);
-GRID_DEF_BINOP(operator|, BinaryOr);
+GRID_DEF_BINOP(operator|,BinaryOr);
-GRID_DEF_BINOP(operator&&, BinaryAndAnd);
+GRID_DEF_BINOP(operator&&,BinaryAndAnd);
-GRID_DEF_BINOP(operator||, BinaryOrOr);
+GRID_DEF_BINOP(operator||,BinaryOrOr);
-GRID_DEF_TRINOP(where, TrinaryWhere);
+GRID_DEF_TRINOP(where,TrinaryWhere);
 /////////////////////////////////////////////////////////////
 // Closure convenience to force expression to evaluate
 /////////////////////////////////////////////////////////////
-template <class Op, class T1>
+template<class Op,class T1>
-auto closure(const LatticeUnaryExpression<Op, T1> &expr)
+  auto closure(const LatticeUnaryExpression<Op,T1> & expr)
-    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
+  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))>
-  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
+{
-      expr);
+  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> ret(expr);
  return ret;
 }
-template <class Op, class T1, class T2>
+template<class Op,class T1, class T2>
-auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
+  auto closure(const LatticeBinaryExpression<Op,T1,T2> & expr)
-    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-                                        eval(0, std::get<1>(expr.second))))> {
+				      eval(0,std::get<1>(expr.second))))>
-  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+{
-                                   eval(0, std::get<1>(expr.second))))>
+  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-      ret(expr);
+				   eval(0,std::get<1>(expr.second))))> ret(expr);
  return ret;
 }
-template <class Op, class T1, class T2, class T3>
+template<class Op,class T1, class T2, class T3>
-auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+  auto closure(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
-    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-                                        eval(0, std::get<1>(expr.second)),
+				      eval(0,std::get<1>(expr.second)),
-                                        eval(0, std::get<2>(expr.second))))> {
+				      eval(0,std::get<2>(expr.second))))>
-  Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+{
-                                   eval(0, std::get<1>(expr.second)),
+  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-                                   eval(0, std::get<2>(expr.second))))>
+				   eval(0,std::get<1>(expr.second)),
-      ret(expr);
+				   eval(0,std::get<2>(expr.second))))> ret(expr);
  return ret;
 }
@ -431,11 +382,12 @@ auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
 #undef GRID_DEF_UNOP
 #undef GRID_DEF_BINOP
 #undef GRID_DEF_TRINOP
 }
 #if 0
 using namespace Grid;
-        
+ 	      
 int main(int argc,char **argv){
   Lattice<double> v1(16);
@ -445,7 +397,7 @@ using namespace Grid;
   BinaryAdd<double,double> tmp;
   LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &> 
     expr(std::make_pair(tmp,
-    std::forward_as_tuple(v1,v2)));
+	  std::forward_as_tuple(v1,v2)));
   tmp.func(eval(0,v1),eval(0,v2));
   auto var = v1+v2;
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@ -1,33 +1,32 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/lattice/Lattice_base.h
+    Source file: ./lib/lattice/Lattice_base.h
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
 /*  END LEGAL */
 #ifndef GRID_LATTICE_BASE_H
 #define GRID_LATTICE_BASE_H
@ -65,6 +64,9 @@ public:
 class LatticeExpressionBase {};
 template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
 template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
 template <typename Op, typename T1>                           
 class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
 public:
@ -99,7 +101,6 @@ public:
    int begin(void) { return 0;};
    int end(void)   { return _odata.size(); }
    vobj & operator[](int i) { return _odata[i]; };
    const vobj & operator[](int i) const { return _odata[i]; };
 public:
    typedef typename vobj::scalar_type scalar_type;
@ -254,18 +255,6 @@ PARALLEL_FOR_LOOP
        checkerboard=0;
    }
    // Copy constructor: builds this lattice as an element-by-element copy of r.
    // The grid pointer is copied as-is (both lattices refer to the same grid
    // object), the checkerboard tag is copied, and the site storage is sized
    // and filled from r.
    Lattice(const Lattice& r){ // copy constructor
    	_grid = r._grid;
    	checkerboard = r.checkerboard;
    	// Size the storage to the grid's oSites() before the indexed writes below.
    	_odata.resize(_grid->oSites());// essential
  		// PARALLEL_FOR_LOOP is defined elsewhere; presumably it makes the
  		// immediately following for-loop parallel -- TODO confirm.
  		PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            _odata[ss]=r._odata[ss];
        }  	
    }
    virtual ~Lattice(void) = default;
    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
@ -278,7 +267,7 @@ PARALLEL_FOR_LOOP
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
-      
+      std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
@ -335,27 +324,27 @@ PARALLEL_FOR_LOOP
-#include "Lattice_conformable.h"
+#include <lattice/Lattice_conformable.h>
 #define GRID_LATTICE_EXPRESSION_TEMPLATES
 #ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
-#include "Lattice_ET.h"
+#include <lattice/Lattice_ET.h>
 #else 
-#include "Lattice_overload.h"
+#include <lattice/Lattice_overload.h>
 #endif
-#include "Lattice_arith.h"
+#include <lattice/Lattice_arith.h>
-#include "Lattice_trace.h"
+#include <lattice/Lattice_trace.h>
-#include "Lattice_transpose.h"
+#include <lattice/Lattice_transpose.h>
-#include "Lattice_local.h"
+#include <lattice/Lattice_local.h>
-#include "Lattice_reduction.h"
+#include <lattice/Lattice_reduction.h>
-#include "Lattice_peekpoke.h"
+#include <lattice/Lattice_peekpoke.h>
-#include "Lattice_reality.h"
+#include <lattice/Lattice_reality.h>
-#include "Lattice_comparison_utils.h"
+#include <lattice/Lattice_comparison_utils.h>
-#include "Lattice_comparison.h"
+#include <lattice/Lattice_comparison.h>
-#include "Lattice_coordinate.h"
+#include <lattice/Lattice_coordinate.h>
-#include "Lattice_where.h"
+#include <lattice/Lattice_where.h>
-#include "Lattice_rng.h"
+#include <lattice/Lattice_rng.h>
-#include "Lattice_unary.h"
+#include <lattice/Lattice_unary.h>
-#include "Lattice_transfer.h"
+#include <lattice/Lattice_transfer.h>
 #endif
--- a/lib/lattice/Lattice_peekpoke.h
+++ b/lib/lattice/Lattice_peekpoke.h
@ -154,7 +154,7 @@ PARALLEL_FOR_LOOP
    template<class vobj,class sobj>
    void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
-      GridBase *grid = l._grid;
+      GridBase *grid=l._grid;
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;
@ -164,18 +164,16 @@ PARALLEL_FOR_LOOP
      assert( l.checkerboard== l._grid->CheckerBoard(site));
      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
      static const int words=sizeof(vobj)/sizeof(vector_type);
      int odx,idx;
      idx= grid->iIndex(site);
      odx= grid->oIndex(site);
-      scalar_type * vp = (scalar_type *)&l._odata[odx];
+      std::vector<sobj> buf(Nsimd);
-      scalar_type * pt = (scalar_type *)&s;
+
-      
+      extract(l._odata[odx],buf);
      for(int w=0;w<words;w++){
        pt[w] = vp[idx+w*Nsimd];
      }
      s = buf[idx];
      return;
    };
@ -192,17 +190,18 @@ PARALLEL_FOR_LOOP
      assert( l.checkerboard== l._grid->CheckerBoard(site));
      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
      static const int words=sizeof(vobj)/sizeof(vector_type);
      int odx,idx;
      idx= grid->iIndex(site);
      odx= grid->oIndex(site);
-      scalar_type * vp = (scalar_type *)&l._odata[odx];
+      std::vector<sobj> buf(Nsimd);
-      scalar_type * pt = (scalar_type *)&s;
+
      // extract-modify-merge cycle is easiest way and this is not perf critical
      extract(l._odata[odx],buf);
-      for(int w=0;w<words;w++){
+      buf[idx] = s;
-        vp[idx+w*Nsimd] = pt[w];
+
-      }
+      merge(l._odata[odx],buf);
      return;
    };
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@ -40,7 +40,7 @@ namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
    ComplexD nrm = innerProduct(arg,arg);
-    return std::real(nrm); 
+    return real(nrm); 
  }
    template<class vobj>
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@ -296,10 +296,9 @@ namespace Grid {
 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 	int l_idx=generator_idx(o_idx,i_idx);
-
+	
-	const int num_rand_seed=16;
+	std::vector<int> site_seeds(4);
-	std::vector<int> site_seeds(num_rand_seed);
+	for(int i=0;i<4;i++){
 	for(int i=0;i<site_seeds.size();i++){
 	  site_seeds[i]= ui(pseeder);
 	}
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
  }
-  //PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
  for(int idx=0;idx<ig->lSites();idx++){
    std::vector<int> lcoor(ni);
    ig->LocalIndexToLocalCoor(idx,lcoor);
@ -386,7 +386,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
  }
  // the above should guarantee that the operations are local
-  //PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@ -420,15 +420,15 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
  assert(hg->_processors[orthog]==1);
  int dl; dl = 0;
-    for(int d=0;d<nh;d++){
+  for(int d=0;d<nh;d++){
-      if ( d != orthog) {
+    if ( d != orthog) {
-	assert(lg->_processors[dl]  == hg->_processors[d]);
+      assert(lg->_processors[dl]  == hg->_processors[d]);
-	assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
-	dl++;
+      dl++;
    }
  }
  // the above should guarantee that the operations are local
-  //PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@ -446,79 +446,6 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
 }
 // Copy one slice of lowDim into higherDim along dimension `orthog`.
 //
 // Every local site of lowDim whose coordinate in dimension `orthog` equals
 // slice_lo is read with peekLocalSite and written with pokeLocalSite into
 // higherDim at the same coordinate, except that the `orthog` component is
 // replaced by slice_hi.
 //
 //   lowDim    : lattice the slice is read from
 //   higherDim : lattice the slice is written to
 //   slice_lo  : coordinate along `orthog` selecting the sites read from lowDim
 //   slice_hi  : coordinate along `orthog` the sites are written to in higherDim
 //   orthog    : dimension index; asserted to satisfy 0 <= orthog < nh
 //
 // Both grids are asserted to have the same number of dimensions and, in
 // every dimension, equal _processors and _ldimensions entries.
 template<class vobj>
 void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
  typedef typename vobj::scalar_object sobj;
  // Single scratch site object reused by every loop iteration.
  // NOTE(review): it would have to become per-iteration if the loop below
  // were ever parallelised.
  sobj s;
  GridBase *lg = lowDim._grid;
  GridBase *hg = higherDim._grid;
  int nl = lg->_ndimension;
  int nh = hg->_ndimension;
  assert(nl == nh);
  assert(orthog<nh);
  assert(orthog>=0);
  for(int d=0;d<nh;d++){
    assert(lg->_processors[d]  == hg->_processors[d]);
    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
  }
  // the above should guarantee that the operations are local
  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
    lg->LocalIndexToLocalCoor(idx,lcoor);
    // Only sites lying on the selected source slice are transferred.
    if( lcoor[orthog] == slice_lo ) { 
      hcoor=lcoor;
      hcoor[orthog] = slice_hi;
      peekLocalSite(s,lowDim,lcoor);
      pokeLocalSite(s,higherDim,hcoor);
    }
  }
 }
 // Copy one slice of higherDim into lowDim along dimension `orthog`.
 //
 // For every local site of lowDim whose coordinate in dimension `orthog`
 // equals slice_lo, the site of higherDim at the same coordinate -- with the
 // `orthog` component replaced by slice_hi -- is read with peekLocalSite and
 // written into lowDim with pokeLocalSite.
 //
 //   lowDim    : lattice the slice is written to
 //   higherDim : lattice the slice is read from
 //   slice_lo  : coordinate along `orthog` selecting the sites written in lowDim
 //   slice_hi  : coordinate along `orthog` the sites are read from in higherDim
 //   orthog    : dimension index; asserted to satisfy 0 <= orthog < nh
 //
 // Both grids are asserted to have the same number of dimensions and, in
 // every dimension, equal _processors and _ldimensions entries.
 template<class vobj>
 void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
  typedef typename vobj::scalar_object sobj;
  // Single scratch site object reused by every loop iteration.
  // NOTE(review): it would have to become per-iteration if the loop below
  // were ever parallelised.
  sobj s;
  GridBase *lg = lowDim._grid;
  GridBase *hg = higherDim._grid;
  int nl = lg->_ndimension;
  int nh = hg->_ndimension;
  assert(nl == nh);
  assert(orthog<nh);
  assert(orthog>=0);
  for(int d=0;d<nh;d++){
    assert(lg->_processors[d]  == hg->_processors[d]);
    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
  }
  // the above should guarantee that the operations are local
  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
    lg->LocalIndexToLocalCoor(idx,lcoor);
    // Only destination sites lying on slice_lo are filled.
    if( lcoor[orthog] == slice_lo ) { 
      hcoor=lcoor;
      hcoor[orthog] = slice_hi;
      peekLocalSite(s,higherDim,hcoor);
      pokeLocalSite(s,lowDim,lcoor);
    }
  }
 }
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
@ -555,96 +482,6 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 }
 //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
 //
 //   out : resized to in._grid->lSites(); element `lex` receives the site whose
 //         local coordinate maps to `lex` via Lexicographic::IndexFromCoor
 //         with the grid's _ldimensions
 //   in  : source lattice (read only)
 //
 // The enable_if return type keeps this overload in play only when
 // isSIMDvectorized<vobj>::value is true and isSIMDvectorized<sobj>::value
 // is false.
 template<typename vobj, typename sobj>
 typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in){
  typedef typename vobj::vector_type vtype;
  GridBase* in_grid = in._grid;
  out.resize(in_grid->lSites());
  int ndim = in_grid->Nd();
  int in_nsimd = vtype::Nsimd();
  // Per-lane inner coordinates, computed once and reused for every outer site.
  std::vector<std::vector<int> > in_icoor(in_nsimd);
  for(int lane=0; lane < in_nsimd; lane++){
    in_icoor[lane].resize(ndim);
    in_grid->iCoorFromIindex(in_icoor[lane], lane);
  }
 // PARALLEL_FOR_LOOP is defined elsewhere; presumably it makes the
 // immediately following for-loop parallel -- TODO confirm.
 PARALLEL_FOR_LOOP
  for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
    //Assemble vector of pointers to output elements
    std::vector<sobj*> out_ptrs(in_nsimd);
    std::vector<int> in_ocoor(ndim);
    in_grid->oCoorFromOindex(in_ocoor, in_oidx);
    std::vector<int> lcoor(in_grid->Nd());
    for(int lane=0; lane < in_nsimd; lane++){
      // local coordinate = outer coordinate + _rdimensions[mu] * lane coordinate
      for(int mu=0;mu<ndim;mu++)
 	lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
      int lex;
      Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
      out_ptrs[lane] = &out[lex];
    }
    //Unpack into those ptrs
    // extract1 is defined elsewhere; presumably it writes lane i of in_vobj
    // through out_ptrs[i] -- TODO confirm, including the meaning of the 0.
    const vobj & in_vobj = in._odata[in_oidx];
    extract1(in_vobj, out_ptrs, 0);
  }
 }
 //Convert a Lattice from one precision to another
 //
 //   out : destination lattice; its checkerboard is overwritten with in's and
 //         every outer site is rebuilt via merge
 //   in  : source lattice (read only)
 //
 // Two passes: `in` is first unpacked by unvectorizeToLexOrdArray into a
 // lexicographically ordered array of SobjOut, then each outer site of `out`
 // is assembled from pointers into that array.
 //
 // NOTE(review): only equality of Nd() is asserted, yet the array is sized by
 // in_grid->lSites() and indexed with out_grid->_ldimensions, so the two
 // grids are presumably expected to share local dimensions -- confirm.
 template<class VobjOut, class VobjIn>
 void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  assert(out._grid->Nd() == in._grid->Nd());
  out.checkerboard = in.checkerboard;
  GridBase *in_grid=in._grid;
  GridBase *out_grid = out._grid;
  typedef typename VobjOut::scalar_object SobjOut;
  // SobjIn is not referenced anywhere else in this function.
  typedef typename VobjIn::scalar_object SobjIn;
  int ndim = out._grid->Nd();
  int out_nsimd = out_grid->Nsimd();
  // Per-lane inner coordinates of the output grid, computed once.
  std::vector<std::vector<int> > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
    out_icoor[lane].resize(ndim);
    out_grid->iCoorFromIindex(out_icoor[lane], lane);
  }
  // The array's element type is already SobjOut, so the change of scalar type
  // happens while unpacking `in`.
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
  // PARALLEL_FOR_LOOP is defined elsewhere; presumably it makes the
  // immediately following for-loop parallel -- TODO confirm.
  PARALLEL_FOR_LOOP
  for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
    std::vector<int> out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
    std::vector<SobjOut*> ptrs(out_nsimd);      
    std::vector<int> lcoor(out_grid->Nd());
    for(int lane=0; lane < out_nsimd; lane++){
      // local coordinate = outer coordinate + _rdimensions[mu] * lane coordinate
      for(int mu=0;mu<ndim;mu++)
 	lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
      ptrs[lane] = &in_slex_conv[llex];
    }
    // merge is defined elsewhere; presumably it packs the objects behind
    // ptrs[lane] into the lanes of the output site -- TODO confirm.
    merge(out._odata[out_oidx], ptrs, 0);
  }
 }
 }
 #endif
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@ -194,22 +194,22 @@ class BinaryIO {
      std::vector<int> site({x,y,z,t});
-      if (grid->IsBoss()) {
+      if ( grid->IsBoss() ) {
-        fin.read((char *)&file_object, sizeof(file_object));
+	fin.read((char *)&file_object,sizeof(file_object));
-        bytes += sizeof(file_object);
+	bytes += sizeof(file_object);
-        if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
+	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
-        if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
+	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
-        if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
+	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
-        if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));
+	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
-        munge(file_object, munged, csum);
+	munge(file_object,munged,csum);
      }
      // The boss who read the file has their value poked
      pokeSite(munged,Umu,site);
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-       << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@ -254,20 +254,20 @@ class BinaryIO {
      if ( grid->IsBoss() ) {
-  
+	
-  if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
+	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
-  if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
+	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
-  if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
+	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
-  if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
+	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
-  // NB could gather an xstrip as an optimisation.
+	// NB could gather an xstrip as an optimisation.
-  fout.write((char *)&file_object,sizeof(file_object));
+	fout.write((char *)&file_object,sizeof(file_object));
-  bytes+=sizeof(file_object);
+	bytes+=sizeof(file_object);
      }
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@ -305,15 +305,15 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);
      if( rank == grid->ThisRank() ){
-  //  std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
+	//	std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
-  parallel.GetState(saved,l_idx);
+	parallel.GetState(saved,l_idx);
      }
      grid->Broadcast(rank,(void *)&saved[0],bytes);
      if ( grid->IsBoss() ) {
-  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-  fout.write((char *)&saved[0],bytes);
+	fout.write((char *)&saved[0],bytes);
      }
    }
@ -355,14 +355,14 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);
      if ( grid->IsBoss() ) {
-  fin.read((char *)&saved[0],bytes);
+	fin.read((char *)&saved[0],bytes);
-  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
      }
      grid->Broadcast(0,(void *)&saved[0],bytes);
      if( rank == grid->ThisRank() ){
-  parallel.SetState(saved,l_idx);
+	parallel.SetState(saved,l_idx);
      }
    }
@ -415,15 +415,15 @@ class BinaryIO {
      if ( d == 0 ) parallel[d] = 0;
      if (parallel[d]) {
-  range[d] = grid->_ldimensions[d];
+	range[d] = grid->_ldimensions[d];
-  start[d] = grid->_processor_coor[d]*range[d];
+	start[d] = grid->_processor_coor[d]*range[d];
-  ioproc[d]= grid->_processor_coor[d];
+	ioproc[d]= grid->_processor_coor[d];
      } else {
-  range[d] = grid->_gdimensions[d];
+	range[d] = grid->_gdimensions[d];
-  start[d] = 0;
+	start[d] = 0;
-  ioproc[d]= 0;
+	ioproc[d]= 0;
-  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }
      slice_vol = slice_vol * range[d];
    }
@ -434,9 +434,9 @@ class BinaryIO {
      std::cout<< std::dec ;
      std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-  std::cout<< range[d];
+	std::cout<< range[d];
-  if( d< grid->_ndimension-1 ) 
+	if( d< grid->_ndimension-1 ) 
-    std::cout<< " x ";
+	  std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@ -463,7 +463,7 @@ class BinaryIO {
      // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-  
+	
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@ -472,8 +472,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);
      for(int d=0;d<nd;d++){
-  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-  gsite[d] = tsite[d]+start[d];               // global site
+	gsite[d] = tsite[d]+start[d];               // global site
      }
      /////////////////////////
@ -487,29 +487,29 @@ class BinaryIO {
      // iorank reads from the seek
      ////////////////////////////////
      if (myrank == iorank) {
-  
+	
-  fin.seekg(offset+g_idx*sizeof(fileObj));
+	fin.seekg(offset+g_idx*sizeof(fileObj));
-  fin.read((char *)&fileObj,sizeof(fileObj));
+	fin.read((char *)&fileObj,sizeof(fileObj));
-  bytes+=sizeof(fileObj);
+	bytes+=sizeof(fileObj);
-  
+	
-  if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
-  if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
-  if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
-  if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
-  
+	
-  munge(fileObj,siteObj,csum);
+	munge(fileObj,siteObj,csum);
-      } 
+      }	
      // Possibly do transport through pt2pt 
      if ( rank != iorank ) { 
-  if ( (myrank == rank) || (myrank==iorank) ) {
+	if ( (myrank == rank) || (myrank==iorank) ) {
-    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
+	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
-  }
+	}
      }
      // Poke at destination
      if ( myrank == rank ) {
-    pokeLocalSite(siteObj,Umu,lsite);
+	  pokeLocalSite(siteObj,Umu,lsite);
      }
      grid->Barrier(); // necessary?
    }
@ -520,7 +520,7 @@ class BinaryIO {
    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@ -558,15 +558,15 @@ class BinaryIO {
      if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
      if (parallel[d]) {
-  range[d] = grid->_ldimensions[d];
+	range[d] = grid->_ldimensions[d];
-  start[d] = grid->_processor_coor[d]*range[d];
+	start[d] = grid->_processor_coor[d]*range[d];
-  ioproc[d]= grid->_processor_coor[d];
+	ioproc[d]= grid->_processor_coor[d];
      } else {
-  range[d] = grid->_gdimensions[d];
+	range[d] = grid->_gdimensions[d];
-  start[d] = 0;
+	start[d] = 0;
-  ioproc[d]= 0;
+	ioproc[d]= 0;
-  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }
      slice_vol = slice_vol * range[d];
@ -577,9 +577,9 @@ class BinaryIO {
      grid->GlobalSum(tmp);
      std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-  std::cout<< range[d];
+	std::cout<< range[d];
-  if( d< grid->_ndimension-1 ) 
+	if( d< grid->_ndimension-1 ) 
-    std::cout<< " x ";
+	  std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@ -610,7 +610,7 @@ class BinaryIO {
    // should aggregate a whole chunk and then write.
    // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-  
+	
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@ -619,8 +619,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);
      for(int d=0;d<nd;d++){
-  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-  gsite[d] = tsite[d]+start[d];               // global site
+	gsite[d] = tsite[d]+start[d];               // global site
      }
@ -640,26 +640,26 @@ class BinaryIO {
      // Pair of nodes may need to do pt2pt send
      if ( rank != iorank ) { // comms is necessary
-  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
-    // Send to IOrank 
+	  // Send to IOrank 
-    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
+	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
-  }
+	}
      }
      grid->Barrier(); // necessary?
      if (myrank == iorank) {
-  
+	
-  munge(siteObj,fileObj,csum);
+	munge(siteObj,fileObj,csum);
-  if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
-  if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
-  if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
-  if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
+	if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
-  
+	
-  fout.seekp(offset+g_idx*sizeof(fileObj));
+	fout.seekp(offset+g_idx*sizeof(fileObj));
-  fout.write((char *)&fileObj,sizeof(fileObj));
+	fout.write((char *)&fileObj,sizeof(fileObj));
-  bytes+=sizeof(fileObj);
+	bytes+=sizeof(fileObj);
      }
    }
@ -668,7 +668,7 @@ class BinaryIO {
    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
--- a/lib/pugixml/.dirstamp
+++ b/lib/pugixml/.dirstamp
--- a/lib/pugixml/pugixml.h
+++ b/lib/pugixml/pugixml.h
@ -17,7 +17,7 @@
 #endif
 // Include user configuration file (this can define various configuration macros)
-#include "pugiconfig.hpp"
+#include <pugixml/pugiconfig.hpp>
 #ifndef HEADER_PUGIXML_HPP
 #define HEADER_PUGIXML_HPP
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@ -55,19 +55,10 @@ namespace QCD {
    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
    // Index conventions:                            Lorentz x Spin x Colour
    // note: static const int or constexpr will work for type deductions
    //       with the intel compiler (up to version 17)
    //////////////////////////////////////////////////////////////////////////////
-    #define ColourIndex  2
+    static const int ColourIndex = 2;
-    #define SpinIndex    1
+    static const int SpinIndex   = 1;
-    #define LorentzIndex 0
+    static const int LorentzIndex= 0;
    // Also should make these a named enum type
    static const int DaggerNo=0;
    static const int DaggerYes=1;
    static const int InverseNo=0;
    static const int InverseYes=1;
    // Useful trait: is this a spin index?
    //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
@ -493,27 +484,16 @@ namespace QCD {
 }   //namespace QCD
 } // Grid
-
+#include <qcd/utils/SpaceTimeGrid.h>
-#include <Grid/qcd/utils/SpaceTimeGrid.h>
+#include <qcd/spin/Dirac.h>
-#include <Grid/qcd/spin/Dirac.h>
+#include <qcd/spin/TwoSpinor.h>
-#include <Grid/qcd/spin/TwoSpinor.h>
+#include <qcd/utils/LinalgUtils.h>
-#include <Grid/qcd/utils/LinalgUtils.h>
+#include <qcd/utils/CovariantCshift.h>
-#include <Grid/qcd/utils/CovariantCshift.h>
+#include <qcd/utils/SUn.h>
-
+#include <qcd/action/Actions.h>
-// Include representations 	
+#include <qcd/hmc/integrators/Integrator.h>
-#include <Grid/qcd/utils/SUn.h>
+#include <qcd/hmc/integrators/Integrator_algorithm.h>
-#include <Grid/qcd/utils/SUnAdjoint.h>
+#include <qcd/hmc/HMC.h>
 #include <Grid/qcd/utils/SUnTwoIndex.h>
 #include <Grid/qcd/representations/hmc_types.h>
 #include <Grid/qcd/action/Actions.h>
 #include <Grid/qcd/smearing/Smearing.h>
 #include <Grid/qcd/hmc/integrators/Integrator.h>
 #include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
 #include <Grid/qcd/hmc/HMC.h>
 #endif
--- a/lib/qcd/action/ActionBase.h
+++ b/lib/qcd/action/ActionBase.h
@ -1,153 +1,86 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/qcd/action/ActionBase.h
+    Source file: ./lib/qcd/action/ActionBase.h
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
 /*  END LEGAL */
 #ifndef QCD_ACTION_BASE
 #define QCD_ACTION_BASE
 namespace Grid {
-namespace QCD {
+namespace QCD{
 template<class GaugeField>
 class Action { 
 template <class GaugeField>
 class Action {
 public:
  bool is_smeared = false;
  // Boundary conditions? // Heatbath?
-  virtual void refresh(const GaugeField& U,
+  virtual void  refresh(const GaugeField &U, GridParallelRNG& pRNG) = 0;// refresh pseudofermions
-                       GridParallelRNG& pRNG) = 0;  // refresh pseudofermions
+  virtual RealD S    (const GaugeField &U)                        = 0;  // evaluate the action
-  virtual RealD S(const GaugeField& U) = 0;         // evaluate the action
+  virtual void  deriv(const GaugeField &U,GaugeField & dSdU )     = 0;  // evaluate the action derivative
-  virtual void deriv(const GaugeField& U,
+  virtual ~Action() {};
                     GaugeField& dSdU) = 0;  // evaluate the action derivative
  virtual ~Action(){};
 };
 // Compile-time position of a type within a std::tuple.
 //
 // Index<Wanted, std::tuple<...>>::value is the zero-based position of the
 // first element type equal to Wanted. Only the two partial specialisations
 // below are defined, so asking for a type that is not in the tuple fails to
 // compile (the recursion reaches the undefined primary template).
 template <class Wanted, class Tuple>
 struct Index;
 // Base case: the wanted type is the head of the tuple.
 template <class Wanted, class... Rest>
 struct Index<Wanted, std::tuple<Wanted, Rest...>> {
  static const std::size_t value = 0;
 };
 // Recursive case: drop the head and count one step further along the tail.
 template <class Wanted, class Head, class... Rest>
 struct Index<Wanted, std::tuple<Head, Rest...>> {
  static const std::size_t value = Index<Wanted, std::tuple<Rest...>>::value + 1;
 };
 // Could derive PseudoFermion action with a PF field, FermionField, and a Grid; implement refresh
 /*
-template <class GaugeField>
+template<class GaugeField, class FermionField>
-struct ActionLevel {
+class PseudoFermionAction : public Action<GaugeField> {
 public:
-  typedef Action<GaugeField>*
+  FermionField Phi;
-      ActPtr;  // now force the same colours as the rest of the code
+  GridParallelRNG &pRNG;
  GridBase &Grid;
-  //Add supported representations here
+  PseudoFermionAction(GridBase &_Grid,GridParallelRNG &_pRNG) : Grid(_Grid), Phi(&_Grid), pRNG(_pRNG) {
-
+  };
-
+
-  unsigned int multiplier;
+  virtual void refresh(const GaugeField &gauge) {
-
+    gaussian(Phi,pRNG);
  std::vector<ActPtr> actions;
  ActionLevel(unsigned int mul = 1) : actions(0), multiplier(mul) {
    assert(mul >= 1);
  };
  void push_back(ActPtr ptr) { actions.push_back(ptr); }
 };
 */
-template <class GaugeField, class Repr = NoHirep >
+template<class GaugeField> struct ActionLevel{
-struct ActionLevel {
+public:
- public:
+   
-  unsigned int multiplier; 
+  typedef Action<GaugeField>*  ActPtr; // now force the same colours as the rest of the code
-  // Fundamental repr actions separated because of the smearing
+  int multiplier;
  typedef Action<GaugeField>* ActPtr;
-  // construct a tuple of vectors of the actions for the corresponding higher
+  std::vector<ActPtr> actions;
  // representation fields
  typedef typename AccessTypes<Action, Repr>::VectorCollection action_collection;
  action_collection actions_hirep;
  typedef typename  AccessTypes<Action, Repr>::FieldTypeCollection action_hirep_types;
-  std::vector<ActPtr>& actions;
+  ActionLevel(int mul = 1) : multiplier(mul) {
-
+    assert (mul > 0);
  // Temporary conversion between ActionLevel and ActionLevelHirep
  //ActionLevelHirep(ActionLevel<GaugeField>& AL ):actions(AL.actions), multiplier(AL.multiplier){}
  ActionLevel(unsigned int mul = 1) : actions(std::get<0>(actions_hirep)), multiplier(mul) {
    // initialize the hirep vectors to zero.
    //apply(this->resize, actions_hirep, 0); //need a working resize
    assert(mul >= 1);
  };
-
+   
-  //void push_back(ActPtr ptr) { actions.push_back(ptr); }
+  void push_back(ActPtr ptr){
-
+    actions.push_back(ptr);
  template < class Field >
  void push_back(Action<Field>* ptr) {
    // insert only in the correct vector
    std::get< Index < Field, action_hirep_types>::value >(actions_hirep).push_back(ptr);
  };
  template < class ActPtr>
  static void resize(ActPtr ap, unsigned int n){
    ap->resize(n);
  }
  //template <std::size_t I>
  //auto getRepresentation(Repr& R)->decltype(std::get<I>(R).U)  {return std::get<I>(R).U;}
  // Loop on tuple for a callable function
  template <std::size_t I = 1, typename Callable, typename ...Args>
  inline typename std::enable_if<I == std::tuple_size<action_collection>::value, void>::type apply(
      Callable, Repr& R,Args&...) const {}
  template <std::size_t I = 1, typename Callable, typename ...Args>
  inline typename std::enable_if<I < std::tuple_size<action_collection>::value, void>::type apply(
      Callable fn, Repr& R, Args&... arguments) const {
    fn(std::get<I>(actions_hirep), std::get<I>(R.rep), arguments...);
    apply<I + 1>(fn, R, arguments...);
  }  
 };
 template<class GaugeField> using ActionSet = std::vector<ActionLevel< GaugeField > >;
 //template <class GaugeField>
 //using ActionSet = std::vector<ActionLevel<GaugeField> >;
-template <class GaugeField, class R>
+}}
 using ActionSet = std::vector<ActionLevel<GaugeField, R> >;
 }
 }
 #endif
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@ -40,25 +40,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Abstract base interface
 ////////////////////////////////////////////
-#include <Grid/qcd/action/ActionBase.h>
+#include <qcd/action/ActionBase.h>
-#include <Grid/qcd/action/ActionParams.h>
+#include <qcd/action/ActionParams.h>
 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
-#include <Grid/qcd/action/gauge/GaugeImpl.h>
+#include <qcd/action/gauge/GaugeImpl.h>
-#include <Grid/qcd/utils/WilsonLoops.h>
+#include <qcd/utils/WilsonLoops.h>
-#include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+#include <qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
-#include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
+#include <qcd/action/fermion/FermionOperatorImpl.h>
-#include <Grid/qcd/action/fermion/FermionOperator.h>
+#include <qcd/action/fermion/FermionOperator.h>
-#include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
+#include <qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
-#include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
+#include <qcd/action/gauge/WilsonGaugeAction.h>
-#include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>
+#include <qcd/action/gauge/PlaqPlusRectangleAction.h>
 namespace Grid {
 namespace QCD {
@ -107,64 +107,41 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 // for EVERY .cc file. This define centralises the list and restores global push of impl cases
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-
+#define FermOpTemplateInstantiate(A) \
 #define FermOp4dVecTemplateInstantiate(A) \
  template class A<WilsonImplF>;		\
  template class A<WilsonImplD>;		\
  template class A<ZWilsonImplF>;		\
  template class A<ZWilsonImplD>;		\
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		
 #define AdjointFermOpTemplateInstantiate(A) \
  template class A<WilsonAdjImplF>; \
  template class A<WilsonAdjImplD>; 
 #define TwoIndexFermOpTemplateInstantiate(A) \
  template class A<WilsonTwoIndexSymmetricImplF>; \
  template class A<WilsonTwoIndexSymmetricImplD>; 
 #define FermOp5dVecTemplateInstantiate(A) \
  template class A<DomainWallVec5dImplF>;	\
  template class A<DomainWallVec5dImplD>;	\
  template class A<ZDomainWallVec5dImplF>;	\
  template class A<ZDomainWallVec5dImplD>;	
 #define FermOpTemplateInstantiate(A) \
 FermOp4dVecTemplateInstantiate(A) \
 FermOp5dVecTemplateInstantiate(A) 
 #define GparityFermOpTemplateInstantiate(A) 
 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
+#include <qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
-#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+#include <qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
-#include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
+#include <qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
-//#include <Grid/qcd/action/fermion/CloverFermion.h>
+//#include <qcd/action/fermion/CloverFermion.h>
-#include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
+#include <qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
-#include <Grid/qcd/action/fermion/DomainWallFermion.h>
+#include <qcd/action/fermion/DomainWallFermion.h>
-#include <Grid/qcd/action/fermion/DomainWallFermion.h>
+#include <qcd/action/fermion/DomainWallFermion.h>
-#include <Grid/qcd/action/fermion/MobiusFermion.h>
+#include <qcd/action/fermion/MobiusFermion.h>
-#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
+#include <qcd/action/fermion/ScaledShamirFermion.h>
-#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
+#include <qcd/action/fermion/MobiusZolotarevFermion.h>
-#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
+#include <qcd/action/fermion/ShamirZolotarevFermion.h>
-#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
+#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
-#include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
+#include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
 #include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
-#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
+#include <qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction
-#include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
+#include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
-#include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
+#include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
-#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
+#include <qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
-#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
+#include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
-#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
+#include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
@ -180,14 +157,6 @@ typedef WilsonFermion<WilsonImplR> WilsonFermionR;
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
 typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
 typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
 typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
 typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
@ -198,11 +167,6 @@ typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
 typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@ -258,21 +222,21 @@ typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/g5HermitianLinop.h>
+#include <qcd/action/fermion/g5HermitianLinop.h>
 ////////////////////////////////////////
 // Pseudo fermion combinations for HMC
 ////////////////////////////////////////
-#include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
+#include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
-#include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
+#include <qcd/action/pseudofermion/TwoFlavour.h>
-#include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
+#include <qcd/action/pseudofermion/TwoFlavourRatio.h>
-#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
+#include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
-#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
+#include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
-#include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
+#include <qcd/action/pseudofermion/OneFlavourRational.h>
-#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
+#include <qcd/action/pseudofermion/OneFlavourRationalRatio.h>
-#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
+#include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
-#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
+#include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@ -28,10 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
@ -48,352 +45,486 @@ namespace QCD {
 		   FourDimGrid,
 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
- { }
+ {
 }
-template<class Impl>  
+ template<class Impl>
-void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
+  void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
-{
+  {
-  int Ls=this->Ls;
+    // Assemble Din
-  std::vector<Coeff_t> diag (Ls,1.0);
+    int Ls=this->Ls;
-  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+    for(int s=0;s<Ls;s++){
-  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
+      if ( s==0 ) {
-  M5D(psi,chi,chi,lower,diag,upper);
+	//	Din = bs psi[s] + cs[s] psi[s+1}
-}
+	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-template<class Impl>
+	//      Din+= -mass*cs[s] psi[s+1}
-void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
+	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
-{
+      } else if ( s==(Ls-1)) { 
-  int Ls=this->Ls;
+	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
-  std::vector<Coeff_t> diag = bs;
+	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
-  std::vector<Coeff_t> upper= cs;
+      } else {
-  std::vector<Coeff_t> lower= cs; 
+	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-  upper[Ls-1]=-mass*upper[Ls-1];
+	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
-  lower[0]   =-mass*lower[0];
+      }
-  M5D(psi,psi,Din,lower,diag,upper);
+    }
 }
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  std::vector<Coeff_t> diag = beo;
  std::vector<Coeff_t> upper(Ls);
  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
  }
-  upper[Ls-1]=-mass*upper[Ls-1];
+ template<class Impl>
-  lower[0]   =-mass*lower[0];
+  void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
-  M5D(psi,psi,chi,lower,diag,upper);
+  {
-}
+    int Ls=this->Ls;
-template<class Impl>
+    for(int s=0;s<Ls;s++){
-void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
+      if ( s==0 ) {
-{
+	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
-  int Ls=this->Ls;
+	axpby_ssp_pminus(Din,1.0,Din,-mass*cs[Ls-1],psi,s,Ls-1);
-  std::vector<Coeff_t> diag = bee;
+      } else if ( s==(Ls-1)) { 
-  std::vector<Coeff_t> upper(Ls);
+	axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0);
-  std::vector<Coeff_t> lower(Ls);
+	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
-  for(int i=0;i<Ls;i++) {
+      } else {
-    upper[i]=-cee[i];
+	axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
-    lower[i]=-cee[i];
+	axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
-  }
+      }
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  std::vector<Coeff_t> diag = bee;
  std::vector<Coeff_t> upper(Ls);
  std::vector<Coeff_t> lower(Ls);
  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
    if ( s==0 ) {
      upper[s] = -cee[s+1] ;
      lower[s] = mass*cee[Ls-1];
    } else if ( s==(Ls-1)) { 
      upper[s] = mass*cee[0];
      lower[s] = -cee[s-1];
    } else {
      upper[s]=-cee[s+1];
      lower[s]=-cee[s-1];
    }
  }
-  M5Ddag(psi,psi,chi,lower,diag,upper);
+  // override multiply
-}
+ template<class Impl>
  RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
-template<class Impl>
+    FermionField Din(psi._grid);
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
  std::vector<Coeff_t> diag(Ls,1.0);
  std::vector<Coeff_t> upper(Ls,-1.0);
  std::vector<Coeff_t> lower(Ls,-1.0);
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
 }
-template<class Impl>
+    // Assemble Din
-void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
+    /*
-{
+    for(int s=0;s<Ls;s++){
-  int Ls=this->Ls;
+      if ( s==0 ) {
-  std::vector<Coeff_t> diag =bs;
+	//	Din = bs psi[s] + cs[s] psi[s+1}
-  std::vector<Coeff_t> upper=cs;
+	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
-  std::vector<Coeff_t> lower=cs;
+	//      Din+= -mass*cs[s] psi[s+1}
-  upper[Ls-1]=-mass*upper[Ls-1];
+	axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
-  lower[0]   =-mass*lower[0];
+      } else if ( s==(Ls-1)) { 
-  M5Ddag(psi,psi,Din,lower,diag,upper);
+	axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
-}
+	axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
      } else {
 	axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
 	axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
      }
    }
    */
    Meooe5D(psi,Din);
-template<class Impl>
+    this->DW(Din,chi,DaggerNo);
-RealD CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
+    // ((b D_W + D_w hop terms +1) on s-diag
-{
+    axpby(chi,1.0,1.0,chi,psi); 
  int Ls=this->Ls;
  FermionField Din(psi._grid);
  // Assemble Din
  Meooe5D(psi,Din);
  this->DW(Din,chi,DaggerNo);
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby(chi,1.0,1.0,chi,psi); 
  M5D(psi,chi);
  return(norm2(chi));
 }
-template<class Impl>
+    // Call Mooee??
-RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
+    for(int s=0;s<Ls;s++){
-{
+      if ( s==0 ){
-  // Under adjoint
+	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
-  //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
+	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,Ls-1);
-  //D2- P+     D2+            P-D1-^dag D2+dag
+      } else if ( s==(Ls-1)) {
-  
+	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,0);
-  FermionField Din(psi._grid);
+	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
-  // Apply Dw
+      } else {
-  this->DW(psi,Din,DaggerYes); 
+	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
-  
+	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
-  MeooeDag5D(Din,chi);
+      }
-  
+    }
-  M5Ddag(psi,chi);
+    return norm2(chi);
  // ((b D_W + D_w hop terms +1) on s-diag
  axpby (chi,1.0,1.0,chi,psi); 
  return norm2(chi);
 }
 // half checkerboard operations
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  // Half-checkerboard operator: build the 5d s-direction combination of psi
  // with Meooe5D, then apply the 4d hop to that intermediate (no dagger).
  //   psi : input field; its checkerboard selects which hop routine runs
  //   chi : output field, written by DhopEO / DhopOE
  // (The previously declared local Ls was never read and has been dropped.)
  FermionField tmp(psi._grid);
  Meooe5D(psi,tmp); 
  // Odd input goes through DhopEO, anything else through DhopOE.
  if ( psi.checkerboard == Odd ) {
    this->DhopEO(tmp,chi,DaggerNo);
  } else {
    this->DhopOE(tmp,chi,DaggerNo);
  }
 }
-template<class Impl>
+ template<class Impl>
-void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
+  RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
-{
+  {
-  FermionField tmp(psi._grid);
+    // Under adjoint
-  // Apply 4d dslash
+    //D1+        D1- P-    ->   D1+^dag   P+ D2-^dag
-  if ( psi.checkerboard == Odd ) {
+    //D2- P+     D2+            P-D1-^dag D2+dag
    this->DhopEO(psi,tmp,DaggerYes);
  } else {
    this->DhopOE(psi,tmp,DaggerYes);
  }
  MeooeDag5D(tmp,chi); 
 }
-template<class Impl>
+    FermionField Din(psi._grid);
-void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
+    // Apply Dw
-  FermionField tmp(psi._grid);
+    this->DW(psi,Din,DaggerYes); 
-  Meo5D(psi,tmp);
+
-  // Apply 4d dslash fragment
+    MeooeDag5D(Din,chi);
-  this->DhopDir(tmp,chi,dir,disp);
+
-}
+    int Ls=this->Ls;
-// force terms; five routines; default to Dhop on diagonal
+    for(int s=0;s<Ls;s++){
-template<class Impl>
+
-void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+      // Collect the terms in DW
-{
+      //	Chi = bs Din[s] + cs[s] Din[s+1}
-  FermionField Din(V._grid);
+      //    Chi+= -mass*cs[s] psi[s+1}
-  
+      /*
-  if ( dag == DaggerNo ) {
+      if ( s==0 ) {
-    //      U d/du [D_w D5] V = U d/du DW D5 V
+	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
-    Meooe5D(V,Din);
+	axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
-    this->DhopDeriv(mat,U,Din,dag);
+      } else if ( s==(Ls-1)) { 
-  } else {
+	axpby_ssp_pplus (chi,bs[s],Din,-mass*cs[0],Din,s,0);
-    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
-    Meooe5D(U,Din);
+      } else {
-    this->DhopDeriv(mat,Din,V,dag);
+	axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
      }
      */
      // FIXME just call MooeeDag??
      // Collect the terms indept of DW
      if ( s==0 ){
 	axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,Ls-1);
      } else if ( s==(Ls-1)) {
 	axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,0);
 	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
      } else {
 	axpby_ssp_pplus(chi,1.0,chi,-1.0,psi,s,s+1);
 	axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
      }
    }
    // ((b D_W + D_w hop terms +1) on s-diag
    axpby (chi,1.0,1.0,chi,psi); 
    return norm2(chi);
  }
-};
+
-template<class Impl>
+  // half checkerboard operations
-void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+ template<class Impl>
-{
+  void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
-  FermionField Din(V._grid);
+  {
-  
+    int Ls=this->Ls;
-  if ( dag == DaggerNo ) {
+
-    //      U d/du [D_w D5] V = U d/du DW D5 V
+    FermionField tmp(psi._grid);
-    Meooe5D(V,Din);
+    // Assemble the 5d matrix
-    this->DhopDerivOE(mat,U,Din,dag);
+    Meooe5D(psi,tmp); 
-  } else {
+#if 0
-    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
+    std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	//	tmp = bs psi[s] + cs[s] psi[s+1}
 	//      tmp+= -mass*cs[s] psi[s+1}
 	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
 	axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
      } else if ( s==(Ls-1)) { 
 	axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
 	axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
      } else {
 	axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
 	axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
      }
    }
    std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
 #endif
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
      this->DhopEO(tmp,chi,DaggerNo);
    } else {
      this->DhopOE(tmp,chi,DaggerNo);
    }
  }
  // Daggered half-checkerboard operator.
  // Order is the reverse of Meooe: the 4d hop (with DaggerYes) is applied to
  // psi first, and the result is then passed through MeooeDag5D into chi.
  //   psi : input field; its checkerboard selects DhopEO (Odd) or DhopOE
  //   chi : output field, written by MeooeDag5D
  template<class Impl>
  void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
  {
    FermionField tmp(psi._grid);
    // Apply 4d dslash
    if ( psi.checkerboard == Odd ) {
      this->DhopEO(psi,tmp,DaggerYes);
    } else {
      this->DhopOE(psi,tmp,DaggerYes);
    }
    MeooeDag5D(tmp,chi); 
    // The disabled section below is a cross-check kept for reference: it
    // prints norm2(chi) from MeooeDag5D, rebuilds chi slice-by-slice with the
    // axpby_ssp_* helpers using beo/ceo, and prints the norm again.
 #if 0
    std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
    // Assemble the 5d matrix
    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
      if ( s==0 ) {
 	axpby_ssp_pplus(chi,beo[s],tmp,   -ceo[s+1]  ,tmp,s,s+1);
 	axpby_ssp_pminus(chi,   1.0,chi,mass*ceo[Ls-1],tmp,s,Ls-1);
      } else if ( s==(Ls-1)) { 
 	axpby_ssp_pplus(chi,beo[s],tmp,mass*ceo[0],tmp,s,0);
 	axpby_ssp_pminus(chi,1.0,chi,-ceo[s-1],tmp,s,s-1);
      } else {
 	axpby_ssp_pplus(chi,beo[s],tmp,-ceo[s+1],tmp,s,s+1);
 	axpby_ssp_pminus(chi,1.0   ,chi,-ceo[s-1],tmp,s,s-1);
      }
    }
    std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
 #endif
  }
 template<class Impl>
  void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
  {
    // Builds chi slice by slice from psi along the fifth (s) direction.
    // Each slice s gets a diagonal term bee[s]*psi[s] plus one contribution
    // from the slice above (via axpby_ssp_pminus) and one from the slice
    // below (via axpby_ssp_pplus). Interior neighbours carry -cee[s]; the
    // neighbour reached by wrapping around the ends (s=0 <-> s=Ls-1) carries
    // mass*cee[s] instead.
    // NOTE(review): the projector semantics of the axpby_ssp_* helpers are
    // defined outside this block.
    const int Ls = this->Ls;
    for (int s = 0; s < Ls; ++s) {
      // The s==0 test takes priority over the s==Ls-1 test, as in the
      // original if / else-if chain.
      const bool atFirst = (s == 0);
      const bool atLast  = !atFirst && (s == Ls-1);
      const int  above   = atLast  ? 0    : s+1;
      const int  below   = atFirst ? Ls-1 : s-1;
      const auto cAbove  = atLast  ? mass*cee[s] : -cee[s];
      const auto cBelow  = atFirst ? mass*cee[s] : -cee[s];
      axpby_ssp_pminus(chi,bee[s],psi,cAbove,psi,s,above);
      axpby_ssp_pplus (chi,1.0,chi,cBelow,psi,s,below);
    }
  }
 template<class Impl>
  void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
    // Single-direction hop: assemble the s-direction combination of psi into
    // tmp using the beo/ceo coefficients, then apply DhopDir for the given
    // (dir, disp) to produce chi.
    //   tmp[s] = beo[s]*psi[s] + (upper neighbour term) + (lower neighbour term)
    // Interior neighbours carry -ceo[s]; the wrap-around neighbour at either
    // end of the s range carries mass*ceo[s].
    const int Ls = this->Ls;
    FermionField tmp(psi._grid);
    for (int s = 0; s < Ls; ++s) {
      // s==0 is tested before s==Ls-1, matching the original branch order.
      const bool atFirst = (s == 0);
      const bool atLast  = !atFirst && (s == Ls-1);
      const int  above   = atLast  ? 0    : s+1;
      const int  below   = atFirst ? Ls-1 : s-1;
      const auto cAbove  = atLast  ? mass*ceo[s] : -ceo[s];
      const auto cBelow  = atFirst ? mass*ceo[s] : -ceo[s];
      axpby_ssp_pminus(tmp,beo[s],psi,cAbove,psi,s,above);
      axpby_ssp_pplus (tmp,1.0,tmp,cBelow,psi,s,below);
    }
    // Apply 4d dslash fragment
    this->DhopDir(tmp,chi,dir,disp);
  }
 template<class Impl>
  void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
  {
    // Daggered counterpart of Mooee. Relative to Mooee the two projector
    // helpers swap roles (pplus for the slice above, pminus for the slice
    // below) and the off-diagonal coefficient is taken from the neighbour's
    // index (cee[s+1], cee[s-1]) rather than from cee[s]. The wrap-around
    // links at the ends use mass*cee[0] and mass*cee[Ls-1].
    const int Ls = this->Ls;
    for (int s = 0; s < Ls; ++s) {
      // s==0 is tested before s==Ls-1, matching the original branch order.
      const bool atFirst = (s == 0);
      const bool atLast  = !atFirst && (s == Ls-1);
      const int  above   = atLast  ? 0    : s+1;
      const int  below   = atFirst ? Ls-1 : s-1;
      const auto cAbove  = atLast  ? mass*cee[0]    : -cee[s+1];
      const auto cBelow  = atFirst ? mass*cee[Ls-1] : -cee[s-1];
      axpby_ssp_pplus (chi,bee[s],psi,cAbove,psi,s,above);
      axpby_ssp_pminus(chi,1.0,chi,cBelow,psi,s,below);
    }
  }
 // Applies the inverse of the s-direction operator to psi, writing chi.
 // The work is done in four in-place sweeps over the Ls slices, using the
 // lee / leem / dee / ueem / uee coefficient vectors (filled in by the
 // "LDU decomposition of eeoo" section of the coefficient setup). Because each
 // sweep reads slices of chi written by earlier iterations, the order of the
 // loops and of the iterations inside them must not be changed.
 // NOTE(review): projector semantics of axpby_ssp_pplus/pminus are taken from
 // the inline comments here; confirm against the helper definitions.
 template<class Impl>
  void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    // Apply (L^{\prime})^{-1}
    // Forward substitution: slice s depends on the already-updated slice s-1.
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
      axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
    }
    // L_m^{-1} 
    // Accumulates contributions from every slice s<Ls-1 into the last slice.
    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
      axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
    }
    // U_m^{-1} D^{-1}
    // Slices 0..Ls-2 are scaled by 1/dee[s] and corrected using the last slice,
    // which is itself still unscaled at this point (hence the /dee[Ls-1]).
    for (int s=0;s<Ls-1;s++){
      // Chi[s] + 1/d chi[s] 
      axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
    }	
    // Finally scale the last slice by 1/dee[Ls-1].
    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
    // Apply U^{-1}
    // Back substitution: slice s depends on the already-updated slice s+1.
    for (int s=Ls-2;s>=0;s--){
      axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1);  // chi[Ls]
    }
  }
 // Daggered counterpart of MooeeInv: same four in-place sweeps, but with the
 // roles of the lower (lee, leem) and upper (uee, ueem) coefficient vectors
 // exchanged and the pplus / pminus helpers swapped. Each sweep reads slices
 // of chi written earlier, so loop and iteration order must be preserved.
 template<class Impl>
  void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
  {
    int Ls=this->Ls;
    // Apply (U^{\prime})^{-dagger}
    // Forward substitution: slice s depends on the already-updated slice s-1.
    axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0]
    for (int s=1;s<Ls;s++){
      axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
    }
    // U_m^{-\dagger} 
    // Accumulates contributions from every slice s<Ls-1 into the last slice.
    for (int s=0;s<Ls-1;s++){
      axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
    }
    // L_m^{-\dagger} D^{-dagger}
    // Slices 0..Ls-2 are scaled by 1/dee[s] and corrected using the last slice,
    // which is still unscaled here (hence the /dee[Ls-1]).
    for (int s=0;s<Ls-1;s++){
      axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
    }	
    // Finally scale the last slice by 1/dee[Ls-1].
    axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable 
    // Apply L^{-dagger}
    // Back substitution: slice s depends on the already-updated slice s+1.
    for (int s=Ls-2;s>=0;s--){
      axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls]
    }
  }
  // force terms; five routines; default to Dhop on diagonal
  template<class Impl>
  void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
  {
    // Force term for the full operator. One of the two fermion fields is
    // first passed through Meooe5D, and the result takes that field's place
    // in the call to DhopDeriv:
    //   dag == DaggerNo : Din = Meooe5D(V), DhopDeriv(mat, U,   Din, dag)
    //   otherwise       : Din = Meooe5D(U), DhopDeriv(mat, Din, V,   dag)
    FermionField Din(V._grid);
    const bool noDagger = (dag == DaggerNo);
    //      U d/du [D_w D5] V      = U d/du DW D5 V
    //      U d/du [D_w D5]^dag V  = U D5^dag d/du DW^dag Y // implicit adj on U in call
    Meooe5D(noDagger ? V : U, Din);
    if ( noDagger ) {
      this->DhopDeriv(mat,U,Din,dag);
    } else {
      this->DhopDeriv(mat,Din,V,dag);
    }
  }
 template<class Impl>
  void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
  {
    FermionField Din(V._grid);
    if ( dag == DaggerNo ) {
      //      U d/du [D_w D5] V = U d/du DW D5 V
      Meooe5D(V,Din);
      this->DhopDerivOE(mat,U,Din,dag);
    } else {
      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
      Meooe5D(U,Din);
      this->DhopDerivOE(mat,Din,V,dag);
-  }
+    }
-};
+  };
-template<class Impl>
+ template<class Impl>
-void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
+  void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
-{
+  {
-  FermionField Din(V._grid);
+    FermionField Din(V._grid);
  if ( dag == DaggerNo ) {
    //      U d/du [D_w D5] V = U d/du DW D5 V
    Meooe5D(V,Din);
    this->DhopDerivEO(mat,U,Din,dag);
  } else {
    //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
    Meooe5D(U,Din);
    this->DhopDerivEO(mat,Din,V,dag);
  }
 };
 // Tanh
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
  // Copies the first Ls entries of zdata->gamma into a Coeff_t vector and
  // hands them to SetCoefficientsInternal with the scale argument fixed at 1.0.
  //   zdata : source of the gamma[] values (must hold at least Ls entries)
  //   b, c  : forwarded unchanged to SetCoefficientsInternal
  std::vector<Coeff_t> gamma(this->Ls);
  int s = 0;
  for (auto &g : gamma) g = zdata->gamma[s++];
  SetCoefficientsInternal(1.0,gamma,b,c);
 }
 //Zolo
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
  // Copies the first Ls entries of zdata->gamma into a Coeff_t vector and
  // hands them, together with zolo_hi, to SetCoefficientsInternal.
  //   zolo_hi : forwarded as the first argument of SetCoefficientsInternal
  //   zdata   : source of the gamma[] values (must hold at least Ls entries)
  //   b, c    : forwarded unchanged
  std::vector<Coeff_t> gamma(this->Ls);
  int s = 0;
  for (auto &g : gamma) g = zdata->gamma[s++];
  SetCoefficientsInternal(zolo_hi,gamma,b,c);
 }
 // Common coefficient setup shared by the Tanh and Zolotarev entry points
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
 {
  int Ls=this->Ls;
-  ///////////////////////////////////////////////////////////
+    if ( dag == DaggerNo ) {
-  // The Cayley coeffs (unprec)
+      //      U d/du [D_w D5] V = U d/du DW D5 V
-  ///////////////////////////////////////////////////////////
+      Meooe5D(V,Din);
-  omega.resize(Ls);
+      this->DhopDerivEO(mat,U,Din,dag);
-  bs.resize(Ls);
+    } else {
-  cs.resize(Ls);
+      //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
-  as.resize(Ls);
+      Meooe5D(U,Din);
      this->DhopDerivEO(mat,Din,V,dag);
    }
  };
-  // 
+  // Tanh
-  // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
+ template<class Impl>
-  //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
+  void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
-  //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
+  {
-  //
+    SetCoefficientsZolotarev(1.0,zdata,b,c);
-  //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
+
  //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
  //
  // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
  // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
  //
  // So 
  //
  // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
  //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
  //
  // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
  //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
  //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
  // 
  double bpc = b+c;
  double bmc = b-c;
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
  }
-  
+  //Zolo
-  ////////////////////////////////////////////////////////
+ template<class Impl>
-  // Constants for the preconditioned matrix Cayley form
+  void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
-  ////////////////////////////////////////////////////////
+  {
-  bee.resize(Ls);
+    int Ls=this->Ls;
-  cee.resize(Ls);
+
-  beo.resize(Ls);
+    ///////////////////////////////////////////////////////////
-  ceo.resize(Ls);
+    // The Cayley coeffs (unprec)
-  
+    ///////////////////////////////////////////////////////////
-  for(int i=0;i<Ls;i++){
+    omega.resize(Ls);
-    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
+    bs.resize(Ls);
-    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
+    cs.resize(Ls);
-    beo[i]=as[i]*bs[i];
+    as.resize(Ls);
    ceo[i]=-as[i]*cs[i];
  }
  aee.resize(Ls);
  aeo.resize(Ls);
  for(int i=0;i<Ls;i++){
    aee[i]=cee[i];
    aeo[i]=ceo[i];
  }
  //////////////////////////////////////////
  // LDU decomposition of eeoo
  //////////////////////////////////////////
  dee.resize(Ls);
  lee.resize(Ls);
  leem.resize(Ls);
  uee.resize(Ls);
  ueem.resize(Ls);
  for(int i=0;i<Ls;i++){
-    dee[i] = bee[i];
+    // 
    // Ts = (    [bs+cs]Dw        )^-1 (    (bs+cs) Dw         )
    //     -(g5  -------       -1 )    ( g5 ---------     + 1  )
    //      (   {2+(bs-cs)Dw}     )    (    2+(bs-cs) Dw       )
    //
    //  bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2(  1/omega(b+c) + (b-c) )
    //  cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2(  1/omega(b+c) - (b-c) )
    //
    // bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
    // bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
    //
    // So 
    //
    // Ts = (    [b+c]Dw/omega_s    )^-1 (    (b+c) Dw /omega_s        )
    //     -(g5  -------         -1 )    ( g5 ---------           + 1  )
    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
    //
    // Ts = (    [b+c]Dw            )^-1 (    (b+c) Dw                 )
    //     -(g5  -------    -omega_s)    ( g5 ---------      + omega_s )
    //      (   {2+(b-c)Dw}         )    (    2+(b-c) Dw               )
    // 
-    if ( i < Ls-1 ) {
+    double bpc = b+c;
    double bmc = b-c;
    for(int i=0; i < Ls; i++){
      as[i] = 1.0;
      omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
      bs[i] = 0.5*(bpc/omega[i] + bmc);
      cs[i] = 0.5*(bpc/omega[i] - bmc);
    }
    ////////////////////////////////////////////////////////
    // Constants for the preconditioned matrix Cayley form
    ////////////////////////////////////////////////////////
    bee.resize(Ls);
    cee.resize(Ls);
    beo.resize(Ls);
    ceo.resize(Ls);
    for(int i=0;i<Ls;i++){
      bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
      cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
      beo[i]=as[i]*bs[i];
      ceo[i]=-as[i]*cs[i];
    }
    aee.resize(Ls);
    aeo.resize(Ls);
    for(int i=0;i<Ls;i++){
      aee[i]=cee[i];
      aeo[i]=ceo[i];
    }
    //////////////////////////////////////////
    // LDU decomposition of eeoo
    //////////////////////////////////////////
    dee.resize(Ls);
    lee.resize(Ls);
    leem.resize(Ls);
    uee.resize(Ls);
    ueem.resize(Ls);
    for(int i=0;i<Ls;i++){
-      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
+      dee[i] = bee[i];
-      leem[i]=mass*cee[Ls-1]/bee[0];
+      if ( i < Ls-1 ) {
-      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
+	
-      
+	lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
-      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
+	    
-      
+	leem[i]=mass*cee[Ls-1]/bee[0];
-      ueem[i]=mass;
+	for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
-      for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
+	
-      ueem[i]*= aee[0]/bee[0];
+	uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
-      
+	
-    } else { 
+	ueem[i]=mass;
-      lee[i] =0.0;
+	for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
-      leem[i]=0.0;
+	ueem[i]*= aee[0]/bee[0];
-      uee[i] =0.0;
+	    
-      ueem[i]=0.0;
+      } else { 
 	lee[i] =0.0;
 	leem[i]=0.0;
 	uee[i] =0.0;
 	ueem[i]=0.0;
      }
    }
    { 
      double delta_d=mass*cee[Ls-1];
      for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
      dee[Ls-1] += delta_d;
    }
  }
  { 
    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
    dee[Ls-1] += delta_d;
  }  
 }
  FermOpTemplateInstantiate(CayleyFermion5D);
  GparityFermOpTemplateInstantiate(CayleyFermion5D);
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@ -51,29 +51,6 @@ namespace Grid {
      virtual void   MooeeDag    (const FermionField &in, FermionField &out);
      virtual void   MooeeInv    (const FermionField &in, FermionField &out);
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
      virtual void   Meo5D (const FermionField &psi, FermionField &chi);
      virtual void   M5D   (const FermionField &psi, FermionField &chi);
      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
      /////////////////////////////////////////////////////
      // Instantiate different versions depending on Impl
      /////////////////////////////////////////////////////
      void M5D(const FermionField &psi,
 	       const FermionField &phi, 
 	       FermionField &chi,
 	       std::vector<Coeff_t> &lower,
 	       std::vector<Coeff_t> &diag,
 	       std::vector<Coeff_t> &upper);
      void M5Ddag(const FermionField &psi,
 		  const FermionField &phi, 
 		  FermionField &chi,
 		  std::vector<Coeff_t> &lower,
 		  std::vector<Coeff_t> &diag,
 		  std::vector<Coeff_t> &upper);
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
      virtual void   Instantiatable(void)=0;
      // force terms; five routines; default to Dhop on diagonal
@ -91,23 +68,23 @@ namespace Grid {
      RealD mass;
      // Cayley form Moebius (tanh and zolotarev)
-      std::vector<Coeff_t> omega; 
+      std::vector<RealD> omega; 
-      std::vector<Coeff_t> bs;    // S dependent coeffs
+      std::vector<RealD> bs;    // S dependent coeffs
-      std::vector<Coeff_t> cs;    
+      std::vector<RealD> cs;    
-      std::vector<Coeff_t> as;    
+      std::vector<RealD> as;    
      // For preconditioning Cayley form
-      std::vector<Coeff_t> bee;    
+      std::vector<RealD> bee;    
-      std::vector<Coeff_t> cee;    
+      std::vector<RealD> cee;    
-      std::vector<Coeff_t> aee;    
+      std::vector<RealD> aee;    
-      std::vector<Coeff_t> beo;    
+      std::vector<RealD> beo;    
-      std::vector<Coeff_t> ceo;    
+      std::vector<RealD> ceo;    
-      std::vector<Coeff_t> aeo;    
+      std::vector<RealD> aeo;    
      // LDU factorisation of the eeoo matrix
-      std::vector<Coeff_t> lee;    
+      std::vector<RealD> lee;    
-      std::vector<Coeff_t> leem;    
+      std::vector<RealD> leem;    
-      std::vector<Coeff_t> uee;    
+      std::vector<RealD> uee;    
-      std::vector<Coeff_t> ueem;    
+      std::vector<RealD> ueem;    
-      std::vector<Coeff_t> dee;    
+      std::vector<RealD> dee;    
      // Constructors
      CayleyFermion5D(GaugeField &_Umu,
@ -120,20 +97,9 @@ namespace Grid {
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
    };
  }
 }
 // Emits explicit template instantiations, for implementation type A, of the
 // four CayleyFermion5D member functions listed below: the six-argument M5D
 // and M5Ddag overloads, MooeeInv and MooeeInvDag.
 // The expansion names FermionField and Coeff_t unqualified, so it must be
 // used at a point where those names resolve.
 #define INSTANTIATE_DPERP(A)\
 template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
 					std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
 					   std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
 template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
 #define CAYLEY_DPERP_CACHE
 #undef  CAYLEY_DPERP_LINALG
 #endif
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@ -1,211 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
 // Pminus forwards
  // Pplus  backwards..
 template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
                                 const FermionField &phi,
                                 FermionField &chi,
                                 std::vector<Coeff_t> &lower,
                                 std::vector<Coeff_t> &diag,
                                 std::vector<Coeff_t> &upper)
 {
  // Site-local s-direction stencil. The outer loop steps through the outer
  // sites in groups of Ls; within a group, slice s of chi is set to
  //   diag[s]*phi[s] + upper[s]*spProj5m(psi[s+1]) + lower[s]*spProj5p(psi[s-1])
  // with the neighbour index wrapping around at both ends of the group.
  //   psi : field whose neighbouring slices are projected
  //   phi : field supplying the diagonal term (must share psi's checkerboard)
  //   chi : output; its checkerboard is set to psi's
  // The diagonal read of phi happens before the write to chi for the same
  // slice, exactly as in the original statement order.
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
 PARALLEL_FOR_LOOP
  for(int base=0;base<grid->oSites();base+=Ls){ // one group of Ls slices per iteration
    for(int s=0;s<Ls;s++){
      // s==0 is tested before s==Ls-1, matching the original branch order.
      const bool atFirst = (s==0);
      const bool atLast  = !atFirst && (s==(Ls-1));
      const int  here    = base+s;
      const int  above   = atLast  ? base+0    : base+s+1;
      const int  below   = atFirst ? base+Ls-1 : base+s-1;
      auto proj = psi._odata[0];
      spProj5m(proj,psi._odata[above]);
      chi[here]=diag[s]*phi[here]+upper[s]*proj;
      spProj5p(proj,psi._odata[below]);
      chi[here]=chi[here]+lower[s]*proj;
    }
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
                                    const FermionField &phi,
                                    FermionField &chi,
                                    std::vector<Coeff_t> &lower,
                                    std::vector<Coeff_t> &diag,
                                    std::vector<Coeff_t> &upper)
 {
  // Same stencil shape as M5D with the two projectors exchanged: slice s of
  // chi is set to
  //   diag[s]*phi[s] + upper[s]*spProj5p(psi[s+1]) + lower[s]*spProj5m(psi[s-1])
  // with the neighbour index wrapping around at both ends of each Ls group.
  //   psi : field whose neighbouring slices are projected
  //   phi : field supplying the diagonal term (must share psi's checkerboard)
  //   chi : output; its checkerboard is set to psi's
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
 PARALLEL_FOR_LOOP
  for(int base=0;base<grid->oSites();base+=Ls){ // one group of Ls slices per iteration
    auto proj = psi._odata[0];
    for(int s=0;s<Ls;s++){
      // s==0 is tested before s==Ls-1, matching the original branch order.
      const bool atFirst = (s==0);
      const bool atLast  = !atFirst && (s==(Ls-1));
      const int  here    = base+s;
      const int  above   = atLast  ? base+0    : base+s+1;
      const int  below   = atFirst ? base+Ls-1 : base+s-1;
      spProj5p(proj,psi._odata[above]);
      chi[here]=diag[s]*phi[here]+upper[s]*proj;
      spProj5m(proj,psi._odata[below]);
      chi[here]=chi[here]+lower[s]*proj;
    }
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
 {
  // Site-local inverse of the s-direction operator. For each group of Ls
  // consecutive outer sites, chi is built from psi in four in-place sweeps
  // that use the lee / leem / dee / ueem / uee coefficient vectors. Every
  // sweep reads slices written by an earlier step, so the order below is
  // significant.
  GridBase *grid=psi._grid;
  int Ls=this->Ls;
  chi.checkerboard=psi.checkerboard;
 PARALLEL_FOR_LOOP
  for(int base=0;base<grid->oSites();base+=Ls){ // one group of Ls slices per iteration
    const int last = base+Ls-1;
    auto proj = psi._odata[0];
    // (L')^{-1}: forward substitution, slice s uses the finished slice s-1
    chi[base]=psi[base];
    for(int s=1;s<Ls;s++){
      spProj5p(proj,chi[base+s-1]);
      chi[base+s] = psi[base+s]-lee[s-1]*proj;
    }
    // L_m^{-1}: fold every slice s<Ls-1 into the last slice
    for(int s=0;s<Ls-1;s++){
      spProj5m(proj,chi[base+s]);
      chi[last] = chi[last] - leem[s]*proj;
    }
    // U_m^{-1} D^{-1}: scale slices 0..Ls-2 and correct them with the
    // (still unscaled) last slice
    for(int s=0;s<Ls-1;s++){
      spProj5p(proj,chi[last]);
      chi[base+s] = (1.0/dee[s])*chi[base+s]-(ueem[s]/dee[Ls-1])*proj;
    }
    chi[last]= (1.0/dee[Ls-1])*chi[last];
    // U^{-1}: back substitution, slice s uses the finished slice s+1
    for(int s=Ls-2;s>=0;s--){
      spProj5m(proj,chi[base+s+1]);
      chi[base+s] = chi[base+s] - uee[s]*proj;
    }
  }
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
  // Site-local daggered inverse of the s-direction operator. Mirrors
  // MooeeInv with the lower (lee, leem) and upper (uee, ueem) coefficient
  // vectors exchanged and the spProj5p / spProj5m projectors swapped. Each
  // sweep reads slices written by an earlier step, so the order below is
  // significant.
  //   psi : input field
  //   chi : output field; its checkerboard is set to psi's
  // The former assert(psi.checkerboard == psi.checkerboard) compared a value
  // with itself and could never fire, so it has been removed; chi's
  // checkerboard is assigned here, leaving nothing to verify.
  GridBase *grid=psi._grid;
  int Ls=this->Ls;
  chi.checkerboard=psi.checkerboard;
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
    // Apply (U^{\prime})^{-dagger}: forward substitution over s
    chi[ss]=psi[ss];
    for (int s=1;s<Ls;s++){
      spProj5m(tmp,chi[ss+s-1]);
      chi[ss+s] = psi[ss+s]-uee[s-1]*tmp;
    }
    // U_m^{-\dagger}: fold every slice s<Ls-1 into the last slice
    for (int s=0;s<Ls-1;s++){
      spProj5p(tmp,chi[ss+s]);
      chi[ss+Ls-1] = chi[ss+Ls-1] - ueem[s]*tmp;
    }
    // L_m^{-\dagger} D^{-dagger}: scale slices 0..Ls-2 and correct them with
    // the (still unscaled) last slice
    for (int s=0;s<Ls-1;s++){
      spProj5m(tmp,chi[ss+Ls-1]);
      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(leem[s]/dee[Ls-1])*tmp;
    }
    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
    // Apply L^{-dagger}: back substitution over s
    for (int s=Ls-2;s>=0;s--){
      spProj5p(tmp,chi[ss+s+1]);
      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
    }
  }
 }
 // Explicit instantiations of the routines in this file, emitted only when
 // CAYLEY_DPERP_CACHE is defined. One INSTANTIATE_DPERP expansion per
 // implementation type, in single (F) and double (D) precision variants.
 #ifdef CAYLEY_DPERP_CACHE
  INSTANTIATE_DPERP(WilsonImplF);
  INSTANTIATE_DPERP(WilsonImplD);
  INSTANTIATE_DPERP(GparityWilsonImplF);
  INSTANTIATE_DPERP(GparityWilsonImplD);
  INSTANTIATE_DPERP(ZWilsonImplF);
  INSTANTIATE_DPERP(ZWilsonImplD);
 #endif
 }}
--- a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Ddense.cc
@ -1,133 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/CayleyFermion5Ddense.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Eigen/Dense>
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
  /*
   * Dense matrix versions of routines
   */
  /*
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
 {
  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
 }
  */
 // MooeeInternal (dense variant): builds two Ls x Ls real matrices from bee,
 // cee and mass, optionally inverts them (inv) and/or takes their adjoint
 // (dag), then applies them site by site: the "plus" matrix acts on the
 // spProj5p projection of psi, the "minus" matrix on the spProj5m projection.
 // The accumulated spinor is scaled by 0.5 and stored in chi.
 //   psi : input field (requires _rdimensions[0] == Ls, asserted below)
 //   chi : output field; its checkerboard tag is set to psi's
 //   dag : non-zero -> use the adjoint of the (possibly inverted) matrices
 //   inv : non-zero -> use the inverse matrices
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
  const int Ls     = this->Ls;
  const int LLs    = psi._grid->_rdimensions[0];
  const int nsites = psi._grid->oSites()/LLs;
  chi.checkerboard = psi.checkerboard;
  assert(Ls==LLs);

  // Diagonal and the single off-diagonal band of each matrix
  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
  for(int s=0;s<Ls;s++){
    Pplus (s,s) = bee[s];
    Pminus(s,s) = bee[s];
    if ( s<Ls-1 ) {
      Pminus(s,s+1) = -cee[s];
      Pplus (s+1,s) = -cee[s+1];
    }
  }
  // Corner entries carry the mass; written last so they win when Ls==1
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];

  // Select plain / inverse, then optionally the adjoint
  Eigen::MatrixXd PplusMat  = Pplus;
  Eigen::MatrixXd PminusMat = Pminus;
  if ( inv ) {
    PplusMat  = Pplus.inverse();
    PminusMat = Pminus.inverse();
  }
  if ( dag ) {
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }

  // For the non-vectorised s-direction this is simple
  for(int site=0;site<nsites;site++){
    const int base = Ls*site;
    for(int row=0;row<Ls;row++){
      SiteSpinor acc;
      acc = zero;
      for(int col=0;col<Ls;col++){
        // Exactly-zero matrix entries are skipped
        if ( PplusMat(row,col) != 0.0 ) {
          SiteHalfSpinor projP;
          spProj5p(projP,psi[base+col]);
          accumRecon5p(acc,PplusMat(row,col)*projP);
        }
        if ( PminusMat(row,col) != 0.0 ) {
          SiteHalfSpinor projM;
          spProj5m(projM,psi[base+col]);
          accumRecon5m(acc,PminusMat(row,col)*projM);
        }
      }
      chi[base+row] = acc*0.5;
    }
  }
 }
 // Explicit instantiations of the dense MooeeInternal for the four Impl types listed.
 template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 }}
--- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
@ -1,149 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/CayleyFermion5Dssp.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
  // Pminus forwards
  // Pplus  backwards
 // M5D (slice-by-slice variant): for every s in [0,Ls) issues two slice
 // updates on chi. The first (axpby_ssp_pminus) combines diag[s] with phi and
 // upper[s] with psi taken at the "up" neighbour slice; the second
 // (axpby_ssp_pplus) adds lower[s] times psi taken at the "down" neighbour
 // slice onto chi. Neighbours wrap round at both ends.
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
 				std::vector<Coeff_t> &lower,
 				std::vector<Coeff_t> &diag,
 				std::vector<Coeff_t> &upper)
 {
  const int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
    // The s==0 test takes precedence over the s==Ls-1 test, exactly as in an
    // if / else-if chain, so Ls==1 is treated as the "first slice" case.
    const bool first = (s==0);
    const bool last  = (!first) && (s==(Ls-1));
    const int  s_up  = last  ? 0    : s+1;
    const int  s_dn  = first ? Ls-1 : s-1;
    axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s_up);
    axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,s_dn);
  }
 }
 // M5Ddag (slice-by-slice variant): same slice walk as M5D but with the two
 // projector routines exchanged -- axpby_ssp_pplus handles the diag/upper term
 // with the "up" neighbour, axpby_ssp_pminus adds the lower term with the
 // "down" neighbour. Neighbours wrap round at both ends.
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
 				   std::vector<Coeff_t> &lower,
 				   std::vector<Coeff_t> &diag,
 				   std::vector<Coeff_t> &upper)
 {
  const int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
    // The s==0 test takes precedence over the s==Ls-1 test, exactly as in an
    // if / else-if chain, so Ls==1 is treated as the "first slice" case.
    const bool first = (s==0);
    const bool last  = (!first) && (s==(Ls-1));
    const int  s_up  = last  ? 0    : s+1;
    const int  s_dn  = first ? Ls-1 : s-1;
    axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s_up);
    axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s_dn);
  }
 }
 // MooeeInv (slice-by-slice variant): applies, in order, the stages
 // (L')^{-1}, L_m^{-1}, U_m^{-1} D^{-1} and U^{-1} to psi, writing chi.
 // Every stage is a sequence of axpby_ssp* slice updates; the order of the
 // calls matters because later updates read slices written by earlier ones.
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi)
 {
  const int Ls   = this->Ls;
  const int last = Ls-1;
  chi.checkerboard=psi.checkerboard;

  // (L')^{-1}: chi[0]=psi[0], then chi[s] = psi[s] - lee[s-1] P_+ chi[s-1]
  axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);
  for (int prev=0;prev+1<Ls;prev++){
    axpby_ssp_pplus(chi,1.0,psi,-lee[prev],chi,prev+1,prev);
  }

  // L_m^{-1}: accumulate -leem[s] P_- chi[s] into the last slice
  for (int src=0;src<last;src++){
    axpby_ssp_pminus(chi,1.0,chi,-leem[src],chi,last,src);
  }

  // U_m^{-1} D^{-1}: scale slice s by 1/dee[s] and subtract the last-slice term
  for (int dst=0;dst<last;dst++){
    axpby_ssp_pplus(chi,1.0/dee[dst],chi,-ueem[dst]/dee[Ls-1],chi,dst,last);
  }
  // The last slice is rescaled only after the loop above has read it
  axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,last,last); // Modest avoidable

  // U^{-1}: backward recursion from slice Ls-2 down to 0
  for (int next=last;next>0;next--){
    axpby_ssp_pminus (chi,1.0,chi,-uee[next-1],chi,next-1,next);
  }
 }
 // MooeeInvDag (slice-by-slice variant): applies, in order, the stages
 // (U')^{-dagger}, U_m^{-dagger}, L_m^{-dagger} D^{-dagger} and L^{-dagger}
 // to psi, writing chi. Call order matters: later slice updates read slices
 // produced by earlier ones.
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
  const int Ls   = this->Ls;
  const int last = Ls-1;
  chi.checkerboard=psi.checkerboard;

  // (U')^{-dagger}: chi[0]=psi[0], then forward recursion with uee
  axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);
  for (int prev=0;prev+1<Ls;prev++){
    axpby_ssp_pminus(chi,1.0,psi,-uee[prev],chi,prev+1,prev);
  }

  // U_m^{-dagger}: accumulate into the last slice
  for (int src=0;src<last;src++){
    axpby_ssp_pplus(chi,1.0,chi,-ueem[src],chi,last,src);
  }

  // L_m^{-dagger} D^{-dagger}: scale slice s and subtract the last-slice term
  for (int dst=0;dst<last;dst++){
    axpby_ssp_pminus(chi,1.0/dee[dst],chi,-leem[dst]/dee[Ls-1],chi,dst,last);
  }
  // The last slice is rescaled only after the loop above has read it
  axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,last,last); // Modest avoidable

  // L^{-dagger}: backward recursion from slice Ls-2 down to 0
  for (int next=last;next>0;next--){
    axpby_ssp_pplus (chi,1.0,chi,-lee[next-1],chi,next-1,next);
  }
 }
 // Compiled only when CAYLEY_DPERP_LINALG is defined. INSTANTIATE is a macro
 // defined outside this file; presumably it emits the explicit template
 // instantiations of the routines above for each listed Impl -- TODO confirm.
 #ifdef CAYLEY_DPERP_LINALG
  INSTANTIATE(WilsonImplF);
  INSTANTIATE(WilsonImplD);
  INSTANTIATE(GparityWilsonImplF);
  INSTANTIATE(GparityWilsonImplD);
 #endif
 }
 }
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@ -1,309 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/CayleyFermion5Dvec.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Eigen/Dense>
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
  /*
   * Dense matrix versions of routines
   */
 // MooeeInvDag: forwards to MooeeInternal with the DaggerYes and InverseYes flags.
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
 {
  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
 }
 // MooeeInv: forwards to MooeeInternal with the DaggerNo and InverseYes flags.
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
 {
  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
 }
 // M5D (variant for grids whose dimension 0 is split across SIMD lanes):
 // for each group of LLs consecutive entries computes
 //   chi[ss+v] = d[v]*phi[ss+v] + u[v]*fp + l[v]*fm
 // where fp is rebuilt from 0.5 * spProj5m of the cyclically next psi entry
 // and fm from 0.5 * spProj5p of the cyclically previous one.
 //   psi, phi          : inputs; must carry the same checkerboard (asserted)
 //   chi               : output; receives psi's checkerboard tag
 //   lower, diag, upper: per-s coefficients, read at indices 0 .. LLs*nsimd-1
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
 				std::vector<Coeff_t> &lower,
 				std::vector<Coeff_t> &diag,
 				std::vector<Coeff_t> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
  int nsimd= Simd::Nsimd();
  // One SIMD word of coefficients per reduced index
  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
  Vector<iSinglet<Simd> > d(LLs);
  // Ls must be exactly nsimd times the reduced extent
  assert(Ls/LLs==nsimd);
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  // just directly address via type pun
  // NOTE(review): this assumes each iSinglet<Simd> is stored as nsimd
  // contiguous scalar_type values -- confirm against the Simd definition.
  typedef typename Simd::scalar_type scalar_type;
  scalar_type * u_p = (scalar_type *)&u[0];
  scalar_type * l_p = (scalar_type *)&l[0];
  scalar_type * d_p = (scalar_type *)&d[0];
  // Scalar slot i of word o receives the coefficient for s = o + i*LLs
  for(int o=0;o<LLs;o++){ // outer
  for(int i=0;i<nsimd;i++){ //inner
    int s  = o+i*LLs;
    int ss = o*nsimd+i;
    u_p[ss] = upper[s];
    l_p[ss] = lower[s];
    d_p[ss] = diag[s];
  }}
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
    alignas(64) SiteSpinor fm;
    for(int v=0;v<LLs;v++){
      // Cyclic neighbours within the group of LLs entries
      int vp=(v+1)%LLs;
      int vm=(v+LLs-1)%LLs;
      spProj5m(hp,psi[ss+vp]);
      spProj5p(hm,psi[ss+vm]);
      // vp<=v / vm>=v hold only when the neighbour index wrapped round; the
      // projection is then rotated (by 1, resp. nsimd-1).
      // NOTE(review): presumably this moves data across SIMD lanes; rotate()
      // is defined elsewhere -- confirm its direction convention.
      if ( vp<=v ) rotate(hp,hp,1);
      if ( vm>=v ) rotate(hm,hm,nsimd-1);
      hp=hp*0.5;
      hm=hm*0.5;
      spRecon5m(fp,hp);
      spRecon5p(fm,hm);
      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
      chi[ss+v] = chi[ss+v]     +l[v]*fm;
    }
  }
 }
 // M5Ddag (variant for grids whose dimension 0 is split across SIMD lanes):
 // same structure as M5D with the two projectors exchanged. For each group of
 // LLs consecutive entries computes
 //   chi[ss+v] = d[v]*phi[ss+v] + u[v]*fp + l[v]*fm
 // where fp is rebuilt from 0.5 * spProj5p of the cyclically next psi entry
 // and fm from 0.5 * spProj5m of the cyclically previous one.
 //   psi, phi          : inputs; must carry the same checkerboard (asserted)
 //   chi               : output; receives psi's checkerboard tag
 //   lower, diag, upper: per-s coefficients, read at indices 0 .. LLs*nsimd-1
 template<class Impl>  
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
 				   std::vector<Coeff_t> &lower,
 				   std::vector<Coeff_t> &diag,
 				   std::vector<Coeff_t> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
  int nsimd= Simd::Nsimd();
  // One SIMD word of coefficients per reduced index
  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
  Vector<iSinglet<Simd> > d(LLs);
  // Ls must be exactly nsimd times the reduced extent
  assert(Ls/LLs==nsimd);
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  // just directly address via type pun
  // NOTE(review): this assumes each iSinglet<Simd> is stored as nsimd
  // contiguous scalar_type values -- confirm against the Simd definition.
  typedef typename Simd::scalar_type scalar_type;
  scalar_type * u_p = (scalar_type *)&u[0];
  scalar_type * l_p = (scalar_type *)&l[0];
  scalar_type * d_p = (scalar_type *)&d[0];
  // Scalar slot i of word o receives the coefficient for s = o + i*LLs
  for(int o=0;o<LLs;o++){ // outer
  for(int i=0;i<nsimd;i++){ //inner
    int s  = o+i*LLs;
    int ss = o*nsimd+i;
    u_p[ss] = upper[s];
    l_p[ss] = lower[s];
    d_p[ss] = diag[s];
  }}
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
    alignas(64) SiteSpinor fm;
    for(int v=0;v<LLs;v++){
      // Cyclic neighbours within the group of LLs entries
      int vp=(v+1)%LLs;
      int vm=(v+LLs-1)%LLs;
      spProj5p(hp,psi[ss+vp]);
      spProj5m(hm,psi[ss+vm]);
      // vp<=v / vm>=v hold only when the neighbour index wrapped round; the
      // projection is then rotated (by 1, resp. nsimd-1).
      // NOTE(review): presumably this moves data across SIMD lanes; rotate()
      // is defined elsewhere -- confirm its direction convention.
      if ( vp<=v ) rotate(hp,hp,1);
      if ( vm>=v ) rotate(hm,hm,nsimd-1);
      hp=hp*0.5;
      hm=hm*0.5;
      spRecon5p(fp,hp);
      spRecon5m(fm,hm);
      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
      chi[ss+v] = chi[ss+v]     +l[v]*fm;
    }
  }
 }
 // MooeeInternal (variant for grids whose dimension 0 is split across SIMD
 // lanes): builds two Ls x Ls complex matrices from bee, cee and mass,
 // optionally inverts them (inv) and/or takes their adjoint (dag), repacks
 // them into SIMD words, and applies them to the spProj5p / spProj5m
 // projections of psi. The recombined result, scaled by 0.5, is stored in chi.
 //   psi : input field
 //   chi : output field; its checkerboard tag is set to psi's
 //   dag : non-zero -> use the adjoint of the (possibly inverted) matrices
 //   inv : non-zero -> use the inverse matrices
 // NOTE(review): unlike M5D there is no assert here that Ls == LLs*Nsimd, yet
 // the indexing of Matp/Matm below relies on it.
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
  int Ls=this->Ls;
  int LLs = psi._grid->_rdimensions[0];
  int vol = psi._grid->oSites()/LLs;
  chi.checkerboard=psi.checkerboard;
  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
  // Diagonal entries
  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
    Pminus(s,s)= bee[s];
  }
  // Super-diagonal of Pminus
  for(int s=0;s<Ls-1;s++){
    Pminus(s,s+1) = -cee[s];
  }
  // Sub-diagonal of Pplus
  for(int s=0;s<Ls-1;s++){
    Pplus(s+1,s) = -cee[s+1];
  }
  // Corner entries carry the mass
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
  Eigen::MatrixXcd PplusMat ;
  Eigen::MatrixXcd PminusMat;
  if ( inv ) {
    PplusMat =Pplus.inverse();
    PminusMat=Pminus.inverse();
  } else { 
    PplusMat =Pplus;
    PminusMat=Pminus;
  }
  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }
  typedef typename SiteHalfSpinor::scalar_type scalar_type;
  const int Nsimd=Simd::Nsimd();
  // Repack: word [LLs*s2+s1] holds, in scalar slot l, matrix element (l*LLs+s1, s2)
  Vector<iSinglet<Simd> > Matp(Ls*LLs);
  Vector<iSinglet<Simd> > Matm(Ls*LLs);
  for(int s2=0;s2<Ls;s2++){
  for(int s1=0;s1<LLs;s1++){
    int istride = LLs;
    int ostride = 1;
      Simd Vp;
      Simd Vm;
      // Fill the SIMD words through a scalar pointer (type pun)
      scalar_type *sp = (scalar_type *)&Vp;
      scalar_type *sm = (scalar_type *)&Vm;
      for(int l=0;l<Nsimd;l++){
 	sp[l] = PplusMat (l*istride+s1*ostride ,s2);
 	sm[l] = PminusMat(l*istride+s1*ostride,s2);
      }
      Matp[LLs*s2+s1] = Vp;
      Matm[LLs*s2+s1] = Vm;
    }
  }
  // Dynamic allocate on stack to get per thread without serialised heap access
  // NOTE(review): the alloca-based allocation is commented out below; the
  // per-iteration scratch buffers are Vector<> objects instead, so the remark
  // above may no longer describe what happens -- confirm.
 PARALLEL_FOR_LOOP
  for(auto site=0;site<vol;site++){
    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
    Vector<SiteHalfSpinor> SitePplus(LLs);
    Vector<SiteHalfSpinor> SitePminus(LLs);
    Vector<SiteHalfSpinor> SiteChiP(LLs);
    Vector<SiteHalfSpinor> SiteChiM(LLs);
    Vector<SiteSpinor>     SiteChi(LLs);
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;
    // Project every entry of this site group once and clear the accumulators
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
      spProj5m(SitePminus[s],psi[lex]);
      SiteChiP[s]=zero;
      SiteChiM[s]=zero;
    }
    // s counts the (lane, s2) pairs visited so far, i.e. s == l*LLs + s2,
    // and selects the block of LLs packed words used for that pair.
    int s=0;
    for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
      for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
 	vbroadcast(BcastP,SitePplus [s2],l);
 	vbroadcast(BcastM,SitePminus[s2],l);
 	for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
 	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
 	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
 	}
      s++;
    }}
    // Recombine the two accumulated half spinors and apply the factor 0.5
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spRecon5p(SiteChi[s],SiteChiP[s]);
      accumRecon5m(SiteChi[s],SiteChiM[s]);
      chi[lex] = SiteChi[s]*0.5;
    }
  }
 }
 // INSTANTIATE_DPERP is a macro defined outside this file; presumably it
 // instantiates the M5D/M5Ddag/MooeeInv/MooeeInvDag routines above -- TODO confirm.
 INSTANTIATE_DPERP(DomainWallVec5dImplD);
 INSTANTIATE_DPERP(DomainWallVec5dImplF);
 INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
 INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
 // Explicit instantiations of MooeeInternal for the same four Impl types.
 template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 }}
--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/lib/qcd/action/fermion/DomainWallFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H
 #define  GRID_QCD_DOMAIN_WALL_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@ -1,532 +1,490 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
+    Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
-/*  END LEGAL */
+#ifndef  GRID_QCD_FERMION_OPERATOR_IMPL_H
-#ifndef GRID_QCD_FERMION_OPERATOR_IMPL_H
+#define  GRID_QCD_FERMION_OPERATOR_IMPL_H
 #define GRID_QCD_FERMION_OPERATOR_IMPL_H
 namespace Grid {
-namespace QCD {
+
  namespace QCD {
-  //////////////////////////////////////////////
+    //////////////////////////////////////////////
-  // Template parameter class constructs to package
+    // Template parameter class constructs to package
-  // externally control Fermion implementations
+    // externally control Fermion implementations
-  // in orthogonal directions
+    // in orthogonal directions
-  //
+    //
-  // Ultimately need Impl to always define types where XXX is opaque
+    // Ultimately need Impl to always define types where XXX is opaque
-  //
+    //
-  //    typedef typename XXX               Simd;
+    //    typedef typename XXX               Simd;
-  //    typedef typename XXX     GaugeLinkField;	
+    //    typedef typename XXX     GaugeLinkField;	
-  //    typedef typename XXX         GaugeField;
+    //    typedef typename XXX         GaugeField;
-  //    typedef typename XXX      GaugeActField;
+    //    typedef typename XXX      GaugeActField;
-  //    typedef typename XXX       FermionField;
+    //    typedef typename XXX       FermionField;
-  //    typedef typename XXX  DoubledGaugeField;
+    //    typedef typename XXX  DoubledGaugeField;
-  //    typedef typename XXX         SiteSpinor;
+    //    typedef typename XXX         SiteSpinor;
-  //    typedef typename XXX     SiteHalfSpinor;	
+    //    typedef typename XXX     SiteHalfSpinor;	
-  //    typedef typename XXX         Compressor;	
+    //    typedef typename XXX         Compressor;	
-  //
+    //
-  // and Methods:
+    // and Methods:
-  //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+    //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
-  //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+    //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
-  //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
+    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
-  //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
+    //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
-  //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
+    //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
-  //
+    //
-  //
+    //
-  // To acquire the typedefs from "Base" (either a base class or template param) use:
+    // To acquire the typedefs from "Base" (either a base class or template param) use:
-  //
+    //
-  // INHERIT_GIMPL_TYPES(Base)
+    // INHERIT_GIMPL_TYPES(Base)
-  // INHERIT_FIMPL_TYPES(Base)
+    // INHERIT_FIMPL_TYPES(Base)
-  // INHERIT_IMPL_TYPES(Base)
+    // INHERIT_IMPL_TYPES(Base)
-  //
+    //
-  // The Fermion operators will do the following:
+    // The Fermion operators will do the following:
-  //
+    //
-  // struct MyOpParams { 
+    // struct MyOpParams { 
-  //   RealD mass;
+    //   RealD mass;
-  // };
+    // };
-  //
+    //
-  //
+    //
-  // template<class Impl>
+    // template<class Impl>
-  // class MyOp : public<Impl> { 
+    // class MyOp : public<Impl> { 
-  // public:
+    // public:
-  //
+    //
-  //    INHERIT_ALL_IMPL_TYPES(Impl);
+    //    INHERIT_ALL_IMPL_TYPES(Impl);
-  //
+    //
-  //    MyOp(MyOpParams Myparm, ImplParams &ImplParam) :  Impl(ImplParam)
+    //    MyOp(MyOpParams Myparm, ImplParams &ImplParam) :  Impl(ImplParam)
-  //    {
+    //    {
-  //
+    //
-  //    };
+    //    };
-  //    
+    //    
-  //  }
+    //  }
-  //////////////////////////////////////////////
+    //////////////////////////////////////////////
-  
+
    ////////////////////////////////////////////////////////////////////////
    // Implementation dependent fermion types
    ////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////
  // Implementation dependent fermion types
  ////////////////////////////////////////////////////////////////////////
 #define INHERIT_FIMPL_TYPES(Impl)\
-  typedef typename Impl::FermionField           FermionField;		\
+    typedef typename Impl::FermionField           FermionField;		\
-  typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
+    typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
-  typedef typename Impl::SiteSpinor               SiteSpinor;		\
+    typedef typename Impl::SiteSpinor               SiteSpinor;		\
-  typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
+    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
-  typedef typename Impl::Compressor               Compressor;		\
+    typedef typename Impl::Compressor               Compressor;		\
-  typedef typename Impl::StencilImpl             StencilImpl;		\
+    typedef typename Impl::StencilImpl              StencilImpl;	\
-  typedef typename Impl::ImplParams ImplParams;				\
+    typedef typename Impl::ImplParams ImplParams;
  typedef typename Impl::Coeff_t       Coeff_t;
 #define INHERIT_IMPL_TYPES(Base) \
  INHERIT_GIMPL_TYPES(Base)	 \
  INHERIT_FIMPL_TYPES(Base)
  /////////////////////////////////////////////////////////////////////////////
  // Single flavour four spinors with colour index
  /////////////////////////////////////////////////////////////////////////////
  template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
  class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
 #define INHERIT_IMPL_TYPES(Base) \
    INHERIT_GIMPL_TYPES(Base)\
    INHERIT_FIMPL_TYPES(Base)
    ///////
    // Single flavour four spinors with colour index
    ///////
    template<class S,int Nrepresentation=Nc>
    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:
-    static const int Dimension = Representation::Dimension;
+      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
    //Necessary?
    constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
    const bool LsVectorised=false;
    typedef _Coeff_t Coeff_t;
-    INHERIT_GIMPL_TYPES(Gimpl);
+      INHERIT_GIMPL_TYPES(Gimpl);
-      
+
-    template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
-    template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
-    template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
-    typedef iImplSpinor<Simd>            SiteSpinor;
+      typedef iImplSpinor    <Simd>           SiteSpinor;
-    typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
-    typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+      typedef iImplDoubledGaugeField<Simd>    SiteDoubledGaugeField;
      typedef Lattice<SiteSpinor>                 FermionField;
      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
      typedef WilsonImplParams ImplParams;
      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
      ImplParams Params;
      WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
      bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
-    typedef Lattice<SiteSpinor>            FermionField;
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
-    typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+        mult(&phi(),&U(mu),&chi());
    typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
    typedef WilsonImplParams ImplParams;
    typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
    ImplParams Params;
    WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
    bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
    inline void multLink(SiteHalfSpinor &phi,
 			 const SiteDoubledGaugeField &U,
 			 const SiteHalfSpinor &chi,
 			 int mu,
 			 StencilEntry *SE,
 			 StencilImpl &St) {
      mult(&phi(), &U(mu), &chi());
    }
    template <class ref>
    inline void loadLinkElement(Simd &reg, ref &memory) {
      reg = memory;
    }
    inline void DoubleStore(GridBase *GaugeGrid,
 			    DoubledGaugeField &Uds,
 			    const GaugeField &Umu) {
      conformable(Uds._grid, GaugeGrid);
      conformable(Umu._grid, GaugeGrid);
      GaugeLinkField U(GaugeGrid);
      for (int mu = 0; mu < Nd; mu++) {
 	U = PeekIndex<LorentzIndex>(Umu, mu);
 	PokeIndex<LorentzIndex>(Uds, U, mu);
 	U = adj(Cshift(U, mu, -1));
 	PokeIndex<LorentzIndex>(Uds, U, mu + 4);
      }
    }
-    inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+      template<class ref>
-      GaugeLinkField link(mat._grid);
+      inline void loadLinkElement(Simd & reg,ref &memory){
-      link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+	reg = memory;
-      PokeIndex<LorentzIndex>(mat,link,mu);
+      }
-    }   
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
-      
+      {
-    inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+        conformable(Uds._grid,GaugeGrid);
-      
+        conformable(Umu._grid,GaugeGrid);
-      int Ls=Btilde._grid->_fdimensions[0];
+        GaugeLinkField U(GaugeGrid);
-      GaugeLinkField tmp(mat._grid);
+        for(int mu=0;mu<Nd;mu++){
-      tmp = zero;
+  	  U = PeekIndex<LorentzIndex>(Umu,mu);
-      
+	  PokeIndex<LorentzIndex>(Uds,U,mu);
-      PARALLEL_FOR_LOOP
+	  U = adj(Cshift(U,mu,-1));
-      for(int sss=0;sss<tmp._grid->oSites();sss++){
+	  PokeIndex<LorentzIndex>(Uds,U,mu+4);
 	int sU=sss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
 	}
      }
      PokeIndex<LorentzIndex>(mat,tmp,mu);
-    }
+      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
-  };
+	GaugeLinkField link(mat._grid);
 	link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
 	PokeIndex<LorentzIndex>(mat,link,mu);
      }   
-  ////////////////////////////////////////////////////////////////////////////////////
+      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
  // Single flavour four spinors with colour index, 5d redblack
  ////////////////////////////////////////////////////////////////////////////////////
-template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
+	int Ls=Btilde._grid->_fdimensions[0];
 class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
  public:
  static const int Dimension = Nrepresentation;
  const bool LsVectorised=true;
  typedef _Coeff_t Coeff_t;      
  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
  INHERIT_GIMPL_TYPES(Gimpl);
  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
  typedef iImplSpinor<Simd> SiteSpinor;
  typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
  typedef Lattice<SiteSpinor> FermionField;
  // Make the doubled gauge field a *scalar*
  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
  typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
  typedef WilsonImplParams ImplParams;
  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
  ImplParams Params;
  DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
  bool overlapCommsCompute(void) { return false; };
  template <class ref>
  inline void loadLinkElement(Simd &reg, ref &memory) {
    vsplat(reg, memory);
  }
-  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
+	GaugeLinkField tmp(mat._grid);
-		       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
+	tmp = zero;
-		       StencilImpl &St) {
+PARALLEL_FOR_LOOP
-    SiteGaugeLink UU;
+	for(int sss=0;sss<tmp._grid->oSites();sss++){
-    for (int i = 0; i < Nrepresentation; i++) {
+	  int sU=sss;
-      for (int j = 0; j < Nrepresentation; j++) {
+	  for(int s=0;s<Ls;s++){
-	vsplat(UU()()(i, j), U(mu)()(i, j));
+	    int sF = s+Ls*sU;
 	    tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
 	  }
 	}
 	PokeIndex<LorentzIndex>(mat,tmp,mu);
      }
-    }
+
-    mult(&phi(), &UU(), &chi());
+    };
-  }
+
    ///////
    // Single flavour four spinors with colour index, 5d redblack
    ///////
    template<class S,int Nrepresentation=Nc>
    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
-  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
+      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
-  {
+      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
-    SiteScalarGaugeField ScalarUmu;
+      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
-    SiteDoubledGaugeField ScalarUds;
+      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
-    GaugeLinkField U(Umu._grid);
+      typedef iImplSpinor    <Simd>           SiteSpinor;
-    GaugeField Uadj(Umu._grid);
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
-    for (int mu = 0; mu < Nd; mu++) {
+      typedef Lattice<SiteSpinor>             FermionField;
-      U = PeekIndex<LorentzIndex>(Umu, mu);
+
-      U = adj(Cshift(U, mu, -1));
+      // Make the doubled gauge field a *scalar*
-      PokeIndex<LorentzIndex>(Uadj, U, mu);
+      typedef iImplDoubledGaugeField<typename Simd::scalar_type>    SiteDoubledGaugeField; // This is a scalar
-    }
+      typedef iImplGaugeField<typename Simd::scalar_type>           SiteScalarGaugeField;  // scalar
      typedef iImplGaugeLink <typename Simd::scalar_type>           SiteScalarGaugeLink;   // scalar
      typedef Lattice<SiteDoubledGaugeField>                  DoubledGaugeField;
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
      typedef WilsonImplParams ImplParams;
      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
      ImplParams Params;
      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
      bool overlapCommsCompute(void) { return false; };
-    for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
+      template<class ref>
-      std::vector<int> lcoor;
+      inline void loadLinkElement(Simd & reg,ref &memory){
-      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+	vsplat(reg,memory);
-      
+      }
-      peekLocalSite(ScalarUmu, Umu, lcoor);
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
-      for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
+      {
-      
+	SiteGaugeLink UU;
-      peekLocalSite(ScalarUmu, Uadj, lcoor);
+	for(int i=0;i<Nrepresentation;i++){
-      for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
+	  for(int j=0;j<Nrepresentation;j++){
-      
+	    vsplat(UU()()(i,j),U(mu)()(i,j));
-      pokeLocalSite(ScalarUds, Uds, lcoor);
+	  }
-    }
+	}
-  }
+        mult(&phi(),&UU(),&chi());
-      
+      }
-  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
+
-  {
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
-    assert(0);
+      {
-  }
+	SiteScalarGaugeField  ScalarUmu;
-      
+	SiteDoubledGaugeField ScalarUds;
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu) 
+
-  {
+        GaugeLinkField U   (Umu._grid);
 	GaugeField     Uadj(Umu._grid);
        for(int mu=0;mu<Nd;mu++){
  	  U = PeekIndex<LorentzIndex>(Umu,mu);
 	  U = adj(Cshift(U,mu,-1));
 	  PokeIndex<LorentzIndex>(Uadj,U,mu);
 	}
 	for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
 	  std::vector<int> lcoor;
 	  GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
 	  peekLocalSite(ScalarUmu,Umu,lcoor);
 	  for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);
 	  peekLocalSite(ScalarUmu,Uadj,lcoor);
 	  for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);
 	  pokeLocalSite(ScalarUds,Uds,lcoor);
 	}
      }
      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
 	assert(0);
-  }
+      }   
-};
+
-    
+      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
 	assert(0);
      }
    };
    ////////////////////////////////////////////////////////////////////////////////////////
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////
    template<class S,int Nrepresentation>
    class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{ 
    public:
      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
      template<typename vtype> using iImplSpinor             = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp >;
      template<typename vtype> using iImplHalfSpinor         = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp >;
      template<typename vtype> using iImplDoubledGaugeField  = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >, Ngp >;
-template <class S, int Nrepresentation,class _Coeff_t = RealD>
+      typedef iImplSpinor    <Simd>           SiteSpinor;
-class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
- public:
+      typedef iImplDoubledGaugeField<Simd>    SiteDoubledGaugeField;
- static const int Dimension = Nrepresentation;
+      typedef Lattice<SiteSpinor>                 FermionField;
      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
- const bool LsVectorised=false;
+      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
- typedef _Coeff_t Coeff_t;
+      typedef GparityWilsonImplParams ImplParams;
- typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
+
- 
+      ImplParams Params;
- INHERIT_GIMPL_TYPES(Gimpl);
+
      GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
- template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
+      bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
- template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
+
- template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
+      // provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
 	typedef SiteHalfSpinor vobj;
 	typedef typename SiteHalfSpinor::scalar_object sobj;
 	vobj vtmp;
 	sobj stmp;
 	GridBase *grid = St._grid;
- typedef iImplSpinor<Simd> SiteSpinor;
+	const int Nsimd = grid->Nsimd();
- typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
+	
- typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+	int direction    = St._directions[mu];
- 
+	int distance     = St._distances[mu];
- typedef Lattice<SiteSpinor> FermionField;
+	int ptype        = St._permute_type[mu]; 
- typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+	int sl           = St._grid->_simd_layout[direction];
- 
+
- typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
+	// Fixme X.Y.Z.T hardcode in stencil
- typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
+	int mmu          = mu % Nd;
- 
+
- typedef GparityWilsonImplParams ImplParams;
+	// assert our assumptions
 	assert((distance==1)||(distance==-1)); // nearest neighbour stencil hard code
 	assert((sl==1)||(sl==2));
 	std::vector<int> icoor;
- ImplParams Params;
+	if ( SE->_around_the_world && Params.twists[mmu] ) {
- GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
+	  if ( sl == 2 ) {
- bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
+	    std::vector<sobj> vals(Nsimd);
- // provide the multiply by link that is differentiated between Gparity (with
+	    extract(chi,vals);
- // flavour index) and non-Gparity
+	    for(int s=0;s<Nsimd;s++){
 inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
 		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
 		      StencilImpl &St) {
-  typedef SiteHalfSpinor vobj;
+	      grid->iCoorFromIindex(icoor,s);
   typedef typename SiteHalfSpinor::scalar_object sobj;
   vobj vtmp;
   sobj stmp;
   GridBase *grid = St._grid;
   const int Nsimd = grid->Nsimd();
   int direction = St._directions[mu];
   int distance = St._distances[mu];
   int ptype = St._permute_type[mu];
   int sl = St._grid->_simd_layout[direction];
   // Fixme X.Y.Z.T hardcode in stencil
   int mmu = mu % Nd;
   // assert our assumptions
   assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
   assert((sl == 1) || (sl == 2));
   std::vector<int> icoor;
   if ( SE->_around_the_world && Params.twists[mmu] ) {
     if ( sl == 2 ) {
       std::vector<sobj> vals(Nsimd);
       extract(chi,vals);
       for(int s=0;s<Nsimd;s++){
 	 grid->iCoorFromIindex(icoor,s);
-	 assert((icoor[direction]==0)||(icoor[direction]==1));
+	      assert((icoor[direction]==0)||(icoor[direction]==1));
-	 int permute_lane;
+	      int permute_lane;
-	 if ( distance == 1) {
+	      if ( distance == 1) {
-	   permute_lane = icoor[direction]?1:0;
+		permute_lane = icoor[direction]?1:0;
-	 } else {
+	      } else {
-	   permute_lane = icoor[direction]?0:1;
+		permute_lane = icoor[direction]?0:1;
 	 }
 	 if ( permute_lane ) { 
 	   stmp(0) = vals[s](1);
 	   stmp(1) = vals[s](0);
 	   vals[s] = stmp;
 	      }
-       }
+	      
-       merge(vtmp,vals);
+	      if ( permute_lane ) { 
-	    
+		stmp(0) = vals[s](1);
-     } else { 
+		stmp(1) = vals[s](0);
-       vtmp(0) = chi(1);
+		vals[s] = stmp;
-       vtmp(1) = chi(0);
+	      }
-     }
+	    }
-     mult(&phi(0),&U(0)(mu),&vtmp(0));
+	    merge(vtmp,vals);
     mult(&phi(1),&U(1)(mu),&vtmp(1));
   } else { 
     mult(&phi(0),&U(0)(mu),&chi(0));
     mult(&phi(1),&U(1)(mu),&chi(1));
   }
 }
- inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+	  } else { 
- {
+	    vtmp(0) = chi(1);
-   conformable(Uds._grid,GaugeGrid);
+	    vtmp(1) = chi(0);
-   conformable(Umu._grid,GaugeGrid);
+	  }
-   
+	  mult(&phi(0),&U(0)(mu),&vtmp(0));
-   GaugeLinkField Utmp (GaugeGrid);
+	  mult(&phi(1),&U(1)(mu),&vtmp(1));
-   GaugeLinkField U    (GaugeGrid);
+	  
-   GaugeLinkField Uconj(GaugeGrid);
+	} else { 
-   
+	  mult(&phi(0),&U(0)(mu),&chi(0));
-   Lattice<iScalar<vInteger> > coor(GaugeGrid);
+	  mult(&phi(1),&U(1)(mu),&chi(1));
 	}
-   for(int mu=0;mu<Nd;mu++){
+      }
     LatticeCoordinate(coor,mu);
     U     = PeekIndex<LorentzIndex>(Umu,mu);
     Uconj = conjugate(U);
     // This phase could come from a simple bc 1,1,-1,1 ..
     int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
     if ( Params.twists[mu] ) { 
       Uconj = where(coor==neglink,-Uconj,Uconj);
     }
 PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](0)(mu) = U[ss]();
       Uds[ss](1)(mu) = Uconj[ss]();
     }
     U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
     Uconj = adj(Cshift(Uconj,mu,-1));
     Utmp = U;
     if ( Params.twists[mu] ) { 
       Utmp = where(coor==0,Uconj,Utmp);
     }
 PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](0)(mu+4) = Utmp[ss]();
     }
     Utmp = Uconj;
     if ( Params.twists[mu] ) { 
       Utmp = where(coor==0,U,Utmp);
     }
 PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](1)(mu+4) = Utmp[ss]();
     }
   }
 }
 inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
-   // DhopDir provides U or Uconj depending on coor/flavour.
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
-   GaugeLinkField link(mat._grid);
+      {
   // use lorentz for flavour as hack.
   auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
 PARALLEL_FOR_LOOP
   for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
     link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
   }
   PokeIndex<LorentzIndex>(mat, link, mu);
   return;
 }
 inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
   int Ls = Btilde._grid->_fdimensions[0];
-   GaugeLinkField tmp(mat._grid);
+	conformable(Uds._grid,GaugeGrid);
-   tmp = zero;
+	conformable(Umu._grid,GaugeGrid);
 	GaugeLinkField Utmp (GaugeGrid);
 	GaugeLinkField U    (GaugeGrid);
 	GaugeLinkField Uconj(GaugeGrid);
 	Lattice<iScalar<vInteger> > coor(GaugeGrid);
 	for(int mu=0;mu<Nd;mu++){
 	  LatticeCoordinate(coor,mu);
 	  U     = PeekIndex<LorentzIndex>(Umu,mu);
 	  Uconj = conjugate(U);
 	  // This phase could come from a simple bc 1,1,-1,1 ..
 	  int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
 	  if ( Params.twists[mu] ) { 
 	    Uconj = where(coor==neglink,-Uconj,Uconj);
 	  }
 PARALLEL_FOR_LOOP
-   for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
+	  for(auto ss=U.begin();ss<U.end();ss++){
-     for (int s = 0; s < Ls; s++) {
+	    Uds[ss](0)(mu) = U[ss]();
-       int sF = s + Ls * ss;
+	    Uds[ss](1)(mu) = Uconj[ss]();
-       auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
+	  }
-       tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+	  
-     }
+	  U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
-   }
+	  Uconj = adj(Cshift(Uconj,mu,-1));
-   PokeIndex<LorentzIndex>(mat, tmp, mu);
+	  
-   return;
+	  Utmp = U;
- }
+	  if ( Params.twists[mu] ) { 
 	    Utmp = where(coor==0,Uconj,Utmp);
 	  }
 PARALLEL_FOR_LOOP
 	  for(auto ss=U.begin();ss<U.end();ss++){
 	    Uds[ss](0)(mu+4) = Utmp[ss]();
 	  }
 	  Utmp = Uconj;
 	  if ( Params.twists[mu] ) { 
 	    Utmp = where(coor==0,U,Utmp);
 	  }
 PARALLEL_FOR_LOOP
 	  for(auto ss=U.begin();ss<U.end();ss++){
 	    Uds[ss](1)(mu+4) = Utmp[ss]();
 	  }
 	}
      }
-};
+      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
 	// DhopDir provides U or Uconj depending on coor/flavour.
 	GaugeLinkField link(mat._grid);
 	// use lorentz for flavour as hack.
 	auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde,A));  
 PARALLEL_FOR_LOOP
        for(auto ss=tmp.begin();ss<tmp.end();ss++){
 	  link[ss]() = tmp[ss](0,0) - conjugate(tmp[ss](1,1)) ;
 	}
 	PokeIndex<LorentzIndex>(mat,link,mu);
 	return;
      }
      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
- typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec
+	int Ls=Btilde._grid->_fdimensions[0];
 typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double
- typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
+	GaugeLinkField tmp(mat._grid);
- typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
+	tmp = zero;
- typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
+PARALLEL_FOR_LOOP
- 
+	for(int ss=0;ss<tmp._grid->oSites();ss++){
- typedef WilsonImpl<vComplex,  AdjointRepresentation > WilsonAdjImplR;   // Real.. whichever prec
+	  for(int s=0;s<Ls;s++){
- typedef WilsonImpl<vComplexF, AdjointRepresentation > WilsonAdjImplF;  // Float
+	    int sF = s+Ls*ss;
- typedef WilsonImpl<vComplexD, AdjointRepresentation > WilsonAdjImplD;  // Double
+	    auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF]));
- 
+	    tmp[ss]() = tmp[ss]()+ ttmp(0,0) + conjugate(ttmp(1,1));
- typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplR;   // Real.. whichever prec
+	  }
- typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplF;  // Float
+	}
- typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplD;  // Double
+	PokeIndex<LorentzIndex>(mat,tmp,mu);
- 
+	return;
- typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
+      }
- typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
+    };
 typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
 typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
 typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
 typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
 typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR;  // Real.. whichever prec
 typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float
 typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double
-}}
+    typedef WilsonImpl<vComplex ,Nc> WilsonImplR; // Real.. whichever prec
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
  }
 }
 #endif
--- a/lib/qcd/action/fermion/MobiusFermion.h
+++ b/lib/qcd/action/fermion/MobiusFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_FERMION_H
 #define  GRID_QCD_MOBIUS_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/MobiusZolotarevFermion.h
+++ b/lib/qcd/action/fermion/MobiusZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
 #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
 #define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
+++ b/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
 #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/ScaledShamirFermion.h
+++ b/lib/qcd/action/fermion/ScaledShamirFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SCALED_SHAMIR_FERMION_H
 #define  GRID_QCD_SCALED_SHAMIR_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/ShamirZolotarevFermion.h
+++ b/lib/qcd/action/fermion/ShamirZolotarevFermion.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
 #define  GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@ -1,315 +1,319 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
+    Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
 /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
-const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2,
+  const std::vector<int> WilsonFermionStatic::directions   ({0,1,2,3, 0, 1, 2, 3});
-                                                        3});
+  const std::vector<int> WilsonFermionStatic::displacements({1,1,1,1,-1,-1,-1,-1});
-const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1,
+  int WilsonFermionStatic::HandOptDslash;
                                                           -1, -1});
 int WilsonFermionStatic::HandOptDslash;
-/////////////////////////////////
+  /////////////////////////////////
-// Constructor and gauge import
+  // Constructor and gauge import
-/////////////////////////////////
+  /////////////////////////////////
-template <class Impl>
+  template<class Impl>
-WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+  WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu,
-                                   GridRedBlackCartesian &Hgrid, RealD _mass,
+				     GridCartesian         &Fgrid,
-                                   const ImplParams &p)
+				     GridRedBlackCartesian &Hgrid, 
-    : Kernels(p),
+				     RealD _mass,const ImplParams &p) :
-      _grid(&Fgrid),
+        Kernels(p),
-      _cbgrid(&Hgrid),
+        _grid(&Fgrid),
-      Stencil(&Fgrid, npoint, Even, directions, displacements),
+	_cbgrid(&Hgrid),
-      StencilEven(&Hgrid, npoint, Even, directions,
+	Stencil    (&Fgrid,npoint,Even,directions,displacements),
-                  displacements),  // source is Even
+	StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
-      StencilOdd(&Hgrid, npoint, Odd, directions,
+	StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
-                 displacements),  // source is Odd
+	mass(_mass),
-      mass(_mass),
+	Lebesgue(_grid),
-      Lebesgue(_grid),
+	LebesgueEvenOdd(_cbgrid),
-      LebesgueEvenOdd(_cbgrid),
+	Umu(&Fgrid),
-      Umu(&Fgrid),
+	UmuEven(&Hgrid),
-      UmuEven(&Hgrid),
+	UmuOdd (&Hgrid) 
-      UmuOdd(&Hgrid) {
+  {
-  // Allocate the required comms buffer
+    // Allocate the required comms buffer
-  ImportGauge(_Umu);
+    ImportGauge(_Umu);
 }
 template <class Impl>
 void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
  GaugeField HUmu(_Umu._grid);
  HUmu = _Umu * (-0.5);
  Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
  pickCheckerboard(Even, UmuEven, Umu);
  pickCheckerboard(Odd, UmuOdd, Umu);
 }
 /////////////////////////////
 // Implement the interface
 /////////////////////////////
 template <class Impl>
 RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
  out.checkerboard = in.checkerboard;
  Dhop(in, out, DaggerNo);
  return axpy_norm(out, 4 + mass, in, out);
 }
 template <class Impl>
 RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
  out.checkerboard = in.checkerboard;
  Dhop(in, out, DaggerYes);
  return axpy_norm(out, 4 + mass, in, out);
 }
 template <class Impl>
 void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
  if (in.checkerboard == Odd) {
    DhopEO(in, out, DaggerNo);
  } else {
    DhopOE(in, out, DaggerNo);
  }
-}
+
-template <class Impl>
+  template<class Impl>
-void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
+  void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
-  if (in.checkerboard == Odd) {
+  {
-    DhopEO(in, out, DaggerYes);
+    GaugeField HUmu(_Umu._grid);
-  } else {
+    HUmu = _Umu*(-0.5);
-    DhopOE(in, out, DaggerYes);
+    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
    pickCheckerboard(Even,UmuEven,Umu);
    pickCheckerboard(Odd ,UmuOdd,Umu);
  }
-}
+  
-
+  /////////////////////////////
-template <class Impl>
+  // Implement the interface
-void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+  /////////////////////////////
-  out.checkerboard = in.checkerboard;
+      
-  typename FermionField::scalar_type scal(4.0 + mass);
+  template<class Impl>
-  out = scal * in;
+  RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) 
-}
+  {
-
+    out.checkerboard=in.checkerboard;
-template <class Impl>
+    Dhop(in,out,DaggerNo);
-void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+    return axpy_norm(out,4+mass,in,out);
  out.checkerboard = in.checkerboard;
  Mooee(in, out);
 }
 template <class Impl>
 void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
  out.checkerboard = in.checkerboard;
  out = (1.0 / (4.0 + mass)) * in;
 }
 template <class Impl>
 void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in,
                                      FermionField &out) {
  out.checkerboard = in.checkerboard;
  MooeeInv(in, out);
 }
 ///////////////////////////////////
 // Internal
 ///////////////////////////////////
 // Shared worker for the DhopDeriv* entry points.
 //
 // For each direction mu in [0, Nd) it applies the single-direction hop
 // kernel to B (result in Btilde) and then hands Btilde, a copy of A and mu
 // to Impl::InsertForce4D, which writes into `mat`.
 //
 //   st  - stencil used for the halo exchange and by the hop kernel
 //   U   - doubled gauge field passed to the hop kernel
 //   mat - output gauge-field-shaped accumulator filled by InsertForce4D
 //   A,B - fermion fields; B is hopped, A is copied unchanged
 //   dag - must be DaggerNo or DaggerYes (asserted)
 template <class Impl>
 void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                         GaugeField &mat, const FermionField &A,
                                         const FermionField &B, int dag) {
  assert((dag == DaggerNo) || (dag == DaggerYes));
  Compressor compressor(dag);
  // Both temporaries are allocated on B's grid; Atilde is a plain copy of A.
  FermionField Btilde(B._grid);
  FermionField Atilde(B._grid);
  Atilde = A;
  // Halo exchange of B happens once, before the per-direction loop.
  st.HaloExchange(B, compressor);
  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma (1+g)<->(1-g) if dag
    ////////////////////////////////////////////////////////////////////////
    // gamma is mu when dag is non-zero and mu + Nd when dag is zero.
    int gamma = mu;
    if (!dag) gamma += Nd;
    ////////////////////////
    // Call the single hop
    ////////////////////////
    // PARALLEL_FOR_LOOP must stay directly in front of the loop it annotates.
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < B._grid->oSites(); sss++) {
      Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
                               gamma);
    }
    //////////////////////////////////////////////////
    // spin trace outer product
    //////////////////////////////////////////////////
    Impl::InsertForce4D(mat, Btilde, Atilde, mu);
  }
 }
-template <class Impl>
+  template<class Impl>
-void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U,
+  RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) 
-                                    const FermionField &V, int dag) {
+  {
-  conformable(U._grid, _grid);
+    out.checkerboard=in.checkerboard;
-  conformable(U._grid, V._grid);
+    Dhop(in,out,DaggerYes);
-  conformable(U._grid, mat._grid);
+    return axpy_norm(out,4+mass,in,out);
  mat.checkerboard = U.checkerboard;
  DerivInternal(Stencil, Umu, mat, U, V, dag);
 }
 // Derivative on the checkerboarded grid with U on Odd sites and V on Even
 // sites.  `mat` is tagged Odd and the work is done by DerivInternal using
 // StencilEven and UmuOdd.
 template <class Impl>
 void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U,
                                       const FermionField &V, int dag) {
  // U, V and mat must all live on the half (checkerboarded) grid.
  conformable(U._grid, this->_cbgrid);
  conformable(U._grid, V._grid);
  conformable(U._grid, mat._grid);

  // Parity preconditions on the two fermion fields.
  assert(V.checkerboard == Even);
  assert(U.checkerboard == Odd);

  mat.checkerboard = Odd;
  this->DerivInternal(this->StencilEven, this->UmuOdd, mat, U, V, dag);
 }
 // Derivative on the checkerboarded grid with U on Even sites and V on Odd
 // sites.  `mat` is tagged Even and the work is done by DerivInternal using
 // StencilOdd and UmuEven.
 template <class Impl>
 void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U,
                                       const FermionField &V, int dag) {
  // U, V and mat must all live on the half (checkerboarded) grid.
  conformable(U._grid, this->_cbgrid);
  conformable(U._grid, V._grid);
  conformable(U._grid, mat._grid);

  // Parity preconditions on the two fermion fields.
  assert(V.checkerboard == Odd);
  assert(U.checkerboard == Even);

  mat.checkerboard = Even;
  this->DerivInternal(this->StencilOdd, this->UmuEven, mat, U, V, dag);
 }
 // Hopping term on the full grid.  `out` inherits the checkerboard tag of
 // `in`; `dag` is forwarded unchanged to DhopInternal.
 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out,
                                int dag) {
  // `in` must be a full-grid field and `out` must share its grid.
  conformable(in._grid, this->_grid);
  conformable(in._grid, out._grid);

  out.checkerboard = in.checkerboard;
  this->DhopInternal(this->Stencil, this->Lebesgue, this->Umu, in, out, dag);
 }
 // Hopping term from Even sites to Odd sites on the half grid.  Requires an
 // Even input; the output is tagged Odd.
 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out,
                                  int dag) {
  // Half-grid check on `in`; the grid comparison with `out` ignores parity.
  conformable(in._grid, this->_cbgrid);
  conformable(in._grid, out._grid);

  assert(in.checkerboard == Even);

  out.checkerboard = Odd;
  this->DhopInternal(this->StencilEven, this->LebesgueEvenOdd, this->UmuOdd, in,
                     out, dag);
 }
 // Hopping term from Odd sites to Even sites on the half grid.  Requires an
 // Odd input; the output is tagged Even.
 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,
                                  int dag) {
  // Half-grid check on `in`; the grid comparison with `out` ignores parity.
  conformable(in._grid, this->_cbgrid);
  conformable(in._grid, out._grid);

  assert(in.checkerboard == Odd);

  out.checkerboard = Even;
  this->DhopInternal(this->StencilOdd, this->LebesgueEvenOdd, this->UmuEven, in,
                     out, dag);
 }
 // Directional operator: for this action it is exactly DhopDir with the same
 // arguments.
 template <class Impl>
 void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out,
                                int dir, int disp) {
  this->DhopDir(in, out, dir, disp);
 }
 // Single-direction hop selected by (dir, disp), always without dagger.
 // For disp == 1 the stencil entry is `dir` and the gamma index is `dir + 4`;
 // for any other disp the two offsets are swapped.
 template <class Impl>
 void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,
                                   int dir, int disp) {
  const bool unitDisp = (disp == 1);
  const int dirdisp = unitDisp ? dir : dir + 4;
  const int gamma = unitDisp ? dir + 4 : dir;
  this->DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
 }
 // Applies the single-direction hop kernel to `in` using the full-grid
 // Stencil and Umu, writing the result to `out`.
 //
 //   dirdisp - stencil entry handed to the kernel
 //   gamma   - gamma index handed to the kernel
 //   dag     - only used here to construct the halo-exchange compressor
 template <class Impl>
 void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
                                       int dirdisp, int gamma, int dag) {
  Compressor compressor(dag);
  // Exchange halos of `in` before any site is processed.
  Stencil.HaloExchange(in, compressor);
  // PARALLEL_FOR_LOOP must stay directly in front of the loop it annotates.
  PARALLEL_FOR_LOOP
  for (int sss = 0; sss < in._grid->oSites(); sss++) {
    Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out,
                             dirdisp, gamma);
  }
 };
-template <class Impl>
+  template<class Impl>
-void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
+  void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) 
-                                       DoubledGaugeField &U,
+  {
-                                       const FermionField &in,
+    if ( in.checkerboard == Odd ) {
-                                       FermionField &out, int dag) {
+      DhopEO(in,out,DaggerNo);
-  assert((dag == DaggerNo) || (dag == DaggerYes));
+    } else {
-
+      DhopOE(in,out,DaggerNo);
-  Compressor compressor(dag);
+    }
-  st.HaloExchange(in, compressor);
+  }
-
+  template<class Impl>
-  if (dag == DaggerYes) {
+  void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) 
-    PARALLEL_FOR_LOOP
+  {
-    for (int sss = 0; sss < in._grid->oSites(); sss++) {
+    if ( in.checkerboard == Odd ) {
-      Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
+      DhopEO(in,out,DaggerYes);
-                                   out);
+    } else {
-    }
+      DhopOE(in,out,DaggerYes);
  } else {
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
      Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
                                out);
    }
  }
 };
-FermOpTemplateInstantiate(WilsonFermion);
+  template<class Impl>
-AdjointFermOpTemplateInstantiate(WilsonFermion);
+  void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
-TwoIndexFermOpTemplateInstantiate(WilsonFermion);
+    out.checkerboard = in.checkerboard;
-GparityFermOpTemplateInstantiate(WilsonFermion);
+    typename FermionField::scalar_type scal(4.0+mass);
-}
+    out = scal*in;
-}
+  }
  // Daggered site-diagonal term.  Copies the checkerboard tag and then
  // delegates to Mooee.
  template<class Impl>
  void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
  {
    out.checkerboard = in.checkerboard;
    this->Mooee(in,out);
  }
  // Inverse of the site-diagonal term: out = in / (4 + mass), with `out`
  // tagged with the checkerboard of `in`.
  template<class Impl>
  void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
  {
    // The scalar factor depends only on the mass; compute it first.
    const auto invDiag = 1.0/(4.0+mass);
    out.checkerboard = in.checkerboard;
    out = invDiag*in;
  }
  // Daggered inverse of the site-diagonal term.  Copies the checkerboard tag
  // and then delegates to MooeeInv.
  template<class Impl>
  void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
  {
    out.checkerboard = in.checkerboard;
    this->MooeeInv(in,out);
  }
  ///////////////////////////////////
  // Internal
  ///////////////////////////////////
  // Shared worker for the DhopDeriv* entry points.
  //
  // For each direction mu in [0, Nd) it applies the single-direction hop
  // kernel to B (result in Btilde) and then hands Btilde, a copy of A and mu
  // to Impl::InsertForce4D, which writes into `mat`.
  //
  //   st  - stencil used for the halo exchange and by the hop kernel
  //   U   - doubled gauge field passed to the hop kernel
  //   mat - output accumulator filled by InsertForce4D
  //   A,B - fermion fields; B is hopped, A is copied unchanged
  //   dag - must be DaggerNo or DaggerYes (asserted)
  template<class Impl>
  void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
 					  DoubledGaugeField & U,
 					  GaugeField &mat,
 					  const FermionField &A,
 					  const FermionField &B,int dag) {
    assert((dag==DaggerNo) ||(dag==DaggerYes));
    Compressor compressor(dag);
    // Both temporaries are allocated on B's grid; Atilde is a plain copy of A.
    FermionField Btilde(B._grid);
    FermionField Atilde(B._grid);
    Atilde = A;
    // Halo exchange of B happens once, before the per-direction loop.
    st.HaloExchange(B,compressor);
    for(int mu=0;mu<Nd;mu++){
      ////////////////////////////////////////////////////////////////////////
      // Flip gamma (1+g)<->(1-g) if dag
      ////////////////////////////////////////////////////////////////////////
      // gamma is mu when dag is non-zero and mu + Nd when dag is zero.
      int gamma = mu;
      if ( !dag ) gamma+= Nd;
      ////////////////////////
      // Call the single hop
      ////////////////////////
      // PARALLEL_FOR_LOOP must stay directly in front of the loop it annotates.
 PARALLEL_FOR_LOOP
 	for(int sss=0;sss<B._grid->oSites();sss++){
 	  Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma);
 	}
      //////////////////////////////////////////////////
      // spin trace outer product
      //////////////////////////////////////////////////
      Impl::InsertForce4D(mat,Btilde,Atilde,mu);
    }
  }
  // Full-grid derivative entry point.  `mat` takes the checkerboard tag of U
  // and the work is done by DerivInternal using the full-grid Stencil and Umu.
  template<class Impl>
  void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat,
                                      const FermionField &U,
                                      const FermionField &V,
                                      int dag)
  {
    // U, V and mat must all live on the full grid.
    conformable(U._grid,this->_grid);
    conformable(U._grid,V._grid);
    conformable(U._grid,mat._grid);

    mat.checkerboard = U.checkerboard;
    this->DerivInternal(this->Stencil,this->Umu,mat,U,V,dag);
  }
  // Derivative on the checkerboarded grid with U on Odd sites and V on Even
  // sites.  `mat` is tagged Odd; DerivInternal runs with StencilEven and UmuOdd.
  template<class Impl>
  void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat,
                                        const FermionField &U,
                                        const FermionField &V,
                                        int dag)
  {
    // U, V and mat must all live on the half (checkerboarded) grid.
    conformable(U._grid,this->_cbgrid);
    conformable(U._grid,V._grid);
    conformable(U._grid,mat._grid);

    // Parity preconditions on the two fermion fields.
    assert(V.checkerboard==Even);
    assert(U.checkerboard==Odd);

    mat.checkerboard = Odd;
    this->DerivInternal(this->StencilEven,this->UmuOdd,mat,U,V,dag);
  }
  // Derivative on the checkerboarded grid with U on Even sites and V on Odd
  // sites.  `mat` is tagged Even; DerivInternal runs with StencilOdd and UmuEven.
  template<class Impl>
  void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat,
                                        const FermionField &U,
                                        const FermionField &V,
                                        int dag)
  {
    // U, V and mat must all live on the half (checkerboarded) grid.
    conformable(U._grid,this->_cbgrid);
    conformable(U._grid,V._grid);
    conformable(U._grid,mat._grid);

    // Parity preconditions on the two fermion fields.
    assert(V.checkerboard==Odd);
    assert(U.checkerboard==Even);

    mat.checkerboard = Even;
    this->DerivInternal(this->StencilOdd,this->UmuEven,mat,U,V,dag);
  }
  // Hopping term on the full grid.  `out` inherits the checkerboard tag of
  // `in`; `dag` is forwarded unchanged to DhopInternal.
  template<class Impl>
  void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
  {
    // `in` must be a full-grid field and `out` must share its grid.
    conformable(in._grid,this->_grid);
    conformable(in._grid,out._grid);

    out.checkerboard = in.checkerboard;
    this->DhopInternal(this->Stencil,this->Lebesgue,this->Umu,in,out,dag);
  }
  // Hopping term from Even sites to Odd sites on the half grid.  Requires an
  // Even input; the output is tagged Odd.
  template<class Impl>
  void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
  {
    // Half-grid check on `in`; the grid comparison with `out` ignores parity.
    conformable(in._grid,this->_cbgrid);
    conformable(in._grid,out._grid);

    assert(in.checkerboard==Even);

    out.checkerboard = Odd;
    this->DhopInternal(this->StencilEven,this->LebesgueEvenOdd,this->UmuOdd,in,out,dag);
  }
  // Hopping term from Odd sites to Even sites on the half grid.  Requires an
  // Odd input; the output is tagged Even.
  template<class Impl>
  void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
  {
    // Half-grid check on `in`; the grid comparison with `out` ignores parity.
    conformable(in._grid,this->_cbgrid);
    conformable(in._grid,out._grid);

    assert(in.checkerboard==Odd);

    out.checkerboard = Even;
    this->DhopInternal(this->StencilOdd,this->LebesgueEvenOdd,this->UmuEven,in,out,dag);
  }
  // Directional operator: for this action it is exactly DhopDir with the same
  // arguments.
  template<class Impl>
  void WilsonFermion<Impl>::Mdir (const FermionField &in, FermionField &out,int dir,int disp)
  {
    this->DhopDir(in,out,dir,disp);
  }
  // Single-direction hop selected by (dir, disp), always without dagger.
  // Exactly one of the two indices is offset by 4: the gamma index when
  // disp == 1, the stencil entry for any other disp.
  template<class Impl>
  void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir,int disp)
  {
    int dirdisp = dir;
    int gamma   = dir;
    if ( disp==1 ) {
      gamma += 4;
    } else {
      dirdisp += 4;
    }
    this->DhopDirDisp(in,out,dirdisp,gamma,DaggerNo);
  }
  // Applies the single-direction hop kernel to `in` using the full-grid
  // Stencil and Umu, writing the result to `out`.
  //
  //   dirdisp - stencil entry handed to the kernel
  //   gamma   - gamma index handed to the kernel
  //   dag     - only used here to construct the halo-exchange compressor
  template<class Impl>
  void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp,int gamma,int dag) {
    Compressor compressor(dag);
    // Exchange halos of `in` before any site is processed.
    Stencil.HaloExchange(in,compressor);
    // PARALLEL_FOR_LOOP must stay directly in front of the loop it annotates.
 PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
 	Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma);
      }
  };
  // Shared worker behind Dhop, DhopOE and DhopEO.
  //
  // Halo-exchanges `in` through `st`, then calls the per-site kernel for
  // every site index below in._grid->oSites(): DiracOptDhopSiteDag when
  // dag == DaggerYes, DiracOptDhopSite otherwise.
  //
  //   st  - stencil used for the exchange and by the kernel
  //   lo  - Lebesgue ordering object forwarded to the kernel
  //   U   - doubled gauge field forwarded to the kernel
  //   dag - must be DaggerNo or DaggerYes (asserted)
  template<class Impl>
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag) 
  {
    assert((dag==DaggerNo) ||(dag==DaggerYes));
    Compressor compressor(dag);
    st.HaloExchange(in,compressor);
    // The dagger test sits outside the site loops, so each loop calls one kernel.
    // PARALLEL_FOR_LOOP must stay directly in front of the loop it annotates.
    if ( dag == DaggerYes ) {
 PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
 	Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    } else {
 PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
 	Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    }
  };
  FermOpTemplateInstantiate(WilsonFermion);
  GparityFermOpTemplateInstantiate(WilsonFermion);
 }}
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@ -1,155 +1,161 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/qcd/action/fermion/WilsonFermion.h
+    Source file: ./lib/qcd/action/fermion/WilsonFermion.h
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
-/*  END LEGAL */
+#ifndef  GRID_QCD_WILSON_FERMION_H
-#ifndef GRID_QCD_WILSON_FERMION_H
+#define  GRID_QCD_WILSON_FERMION_H
 #define GRID_QCD_WILSON_FERMION_H
 namespace Grid {
-namespace QCD {
+  namespace QCD {
-class WilsonFermionStatic {
+    class WilsonFermionStatic {
- public:
+    public:
-  static int HandOptDslash;  // these are a temporary hack
+      static int HandOptDslash; // these are a temporary hack
-  static int MortonOrder;
+      static int MortonOrder;
-  static const std::vector<int> directions;
+      static const std::vector<int> directions   ;
-  static const std::vector<int> displacements;
+      static const std::vector<int> displacements;
-  static const int npoint = 8;
+      static const int npoint=8;
-};
+    };
-template <class Impl>
+    template<class Impl>
-class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
+    class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic
- public:
+    {
-  INHERIT_IMPL_TYPES(Impl);
+    public:
-  typedef WilsonKernels<Impl> Kernels;
+    INHERIT_IMPL_TYPES(Impl);
    typedef WilsonKernels<Impl> Kernels;
-  ///////////////////////////////////////////////////////////////
+      ///////////////////////////////////////////////////////////////
-  // Implement the abstract base
+      // Implement the abstract base
-  ///////////////////////////////////////////////////////////////
+      ///////////////////////////////////////////////////////////////
-  GridBase *GaugeGrid(void) { return _grid; }
+      GridBase *GaugeGrid(void)              { return _grid ;}
-  GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
+      GridBase *GaugeRedBlackGrid(void)      { return _cbgrid ;}
-  GridBase *FermionGrid(void) { return _grid; }
+      GridBase *FermionGrid(void)            { return _grid;}
-  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
+      GridBase *FermionRedBlackGrid(void)    { return _cbgrid;}
-  //////////////////////////////////////////////////////////////////
+      //////////////////////////////////////////////////////////////////
-  // override multiply; cut number routines if pass dagger argument
+      // override multiply; cut number routines if pass dagger argument
-  // and also make interface more uniformly consistent
+      // and also make interface more uniformly consistent
-  //////////////////////////////////////////////////////////////////
+      //////////////////////////////////////////////////////////////////
-  RealD M(const FermionField &in, FermionField &out);
+      RealD M(const FermionField &in, FermionField &out);
-  RealD Mdag(const FermionField &in, FermionField &out);
+      RealD Mdag(const FermionField &in, FermionField &out);
-  /////////////////////////////////////////////////////////
+      /////////////////////////////////////////////////////////
-  // half checkerboard operations
+      // half checkerboard operations
-  // could remain virtual so we  can derive Clover from Wilson base
+      // could remain virtual so we  can derive Clover from Wilson base
-  /////////////////////////////////////////////////////////
+      /////////////////////////////////////////////////////////
-  void Meooe(const FermionField &in, FermionField &out);
+      void Meooe(const FermionField &in, FermionField &out) ;
-  void MeooeDag(const FermionField &in, FermionField &out);
+      void MeooeDag(const FermionField &in, FermionField &out) ;
-  // allow override for twisted mass and clover
+      // allow override for twisted mass and clover
-  virtual void Mooee(const FermionField &in, FermionField &out);
+      virtual void Mooee(const FermionField &in, FermionField &out) ;
-  virtual void MooeeDag(const FermionField &in, FermionField &out);
+      virtual void MooeeDag(const FermionField &in, FermionField &out) ;
-  virtual void MooeeInv(const FermionField &in, FermionField &out);
+      virtual void MooeeInv(const FermionField &in, FermionField &out) ;
-  virtual void MooeeInvDag(const FermionField &in, FermionField &out);
+      virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
-  ////////////////////////
+      ////////////////////////
-  // Derivative interface
+      // Derivative interface
-  ////////////////////////
+      ////////////////////////
-  // Interface calls an internal routine
+      // Interface calls an internal routine
-  void DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V,
+      void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-                 int dag);
+      void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-  void DhopDerivOE(GaugeField &mat, const FermionField &U,
+      void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
                   const FermionField &V, int dag);
  void DhopDerivEO(GaugeField &mat, const FermionField &U,
                   const FermionField &V, int dag);
  ///////////////////////////////////////////////////////////////
  // non-hermitian hopping term; half cb or both
  ///////////////////////////////////////////////////////////////
  void Dhop(const FermionField &in, FermionField &out, int dag);
  void DhopOE(const FermionField &in, FermionField &out, int dag);
  void DhopEO(const FermionField &in, FermionField &out, int dag);
  ///////////////////////////////////////////////////////////////
  // Multigrid assistance; force term uses too
  ///////////////////////////////////////////////////////////////
  void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
  void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
  void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp,
                   int gamma, int dag);
  ///////////////////////////////////////////////////////////////
  // Extra methods added by derived
  ///////////////////////////////////////////////////////////////
  void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
                     const FermionField &A, const FermionField &B, int dag);
  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                    const FermionField &in, FermionField &out, int dag);
  // Constructor
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                GridRedBlackCartesian &Hgrid, RealD _mass,
                const ImplParams &p = ImplParams());
  // DoubleStore impl dependent
  void ImportGauge(const GaugeField &_Umu);
  ///////////////////////////////////////////////////////////////
  // Data members required to support the functionality
  ///////////////////////////////////////////////////////////////
  //    protected:
 public:
  RealD mass;
  GridBase *_grid;
  GridBase *_cbgrid;
  // Defines the stencils for even and odd
  StencilImpl Stencil;
  StencilImpl StencilEven;
  StencilImpl StencilOdd;
  // Copy of the gauge field , with even and odd subsets
  DoubledGaugeField Umu;
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;
  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
 };
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
-}
+      ///////////////////////////////////////////////////////////////
      // non-hermitian hopping term; half cb or both
      ///////////////////////////////////////////////////////////////
      void Dhop(const FermionField &in, FermionField &out,int dag) ;
      void DhopOE(const FermionField &in, FermionField &out,int dag) ;
      void DhopEO(const FermionField &in, FermionField &out,int dag) ;
      ///////////////////////////////////////////////////////////////
      // Multigrid assistance; force term uses too
      ///////////////////////////////////////////////////////////////
      void Mdir (const FermionField &in, FermionField &out,int dir,int disp) ;
      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
      void DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp,int gamma,int dag) ;
      ///////////////////////////////////////////////////////////////
      // Extra methods added by derived
      ///////////////////////////////////////////////////////////////
      void DerivInternal(StencilImpl & st,
 			 DoubledGaugeField & U,
 			 GaugeField &mat,
 			 const FermionField &A,
 			 const FermionField &B,
 			 int dag);
      void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
 			const FermionField &in, FermionField &out,int dag) ;
      // Constructor
      WilsonFermion(GaugeField &_Umu,
 		    GridCartesian         &Fgrid,
 		    GridRedBlackCartesian &Hgrid, 
 		    RealD _mass,
 		    const ImplParams &p= ImplParams()
 		    ) ;
      // DoubleStore impl dependent
      void ImportGauge(const GaugeField &_Umu);
      ///////////////////////////////////////////////////////////////
      // Data members required to support the functionality
      ///////////////////////////////////////////////////////////////
      //    protected:
    public:
      RealD                        mass;
      GridBase                     *    _grid; 
      GridBase                     *  _cbgrid;
      //Defines the stencils for even and odd
      StencilImpl Stencil; 
      StencilImpl StencilEven; 
      StencilImpl StencilOdd; 
      // Copy of the gauge field , with even and odd subsets
      DoubledGaugeField Umu;
      DoubledGaugeField UmuEven;
      DoubledGaugeField UmuOdd;
      LebesgueOrder Lebesgue;
      LebesgueOrder LebesgueEvenOdd;
    };
    typedef WilsonFermion<WilsonImplF> WilsonFermionF;
    typedef WilsonFermion<WilsonImplD> WilsonFermionD;
  }
 }
 #endif
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@ -42,15 +42,15 @@ const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1
  // 5d lattice for DWF.
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
-               GridCartesian         &FiveDimGrid,
+				       GridCartesian         &FiveDimGrid,
-               GridRedBlackCartesian &FiveDimRedBlackGrid,
+				       GridRedBlackCartesian &FiveDimRedBlackGrid,
-               GridCartesian         &FourDimGrid,
+				       GridCartesian         &FourDimGrid,
-               GridRedBlackCartesian &FourDimRedBlackGrid,
+				       GridRedBlackCartesian &FourDimRedBlackGrid,
-               RealD _M5,const ImplParams &p) :
+				       RealD _M5,const ImplParams &p) :
  Kernels(p),
-  _FiveDimGrid        (&FiveDimGrid),
+  _FiveDimGrid(&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
-  _FourDimGrid        (&FourDimGrid),
+  _FourDimGrid(&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
@ -62,83 +62,60 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid)
 {
-  if (Impl::LsVectorised) { 
+  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FourDimRedBlackGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._checker_dim==1);
-    int nsimd = Simd::Nsimd();
+  // Dimension zero of the five-d is the Ls direction
-    
+  Ls=FiveDimGrid._fdimensions[0];
-    // some assertions
+  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
-    assert(FiveDimGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._processors[0] ==1);
-    assert(FiveDimRedBlackGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._simd_layout[0]==1);
-    assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+  assert(FiveDimGrid._processors[0]         ==1);
-    assert(FourDimGrid._ndimension==4);
+  assert(FiveDimGrid._simd_layout[0]        ==1);
-    // Dimension zero of the five-d is the Ls direction
+  // Other dimensions must match the decomposition of the four-D fields 
-    Ls=FiveDimGrid._fdimensions[0];
+  for(int d=0;d<4;d++){
-    assert(FiveDimGrid._processors[0]         ==1);
+    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
-    assert(FiveDimGrid._simd_layout[0]        ==nsimd);
+    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
-    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
-    assert(FiveDimRedBlackGrid._processors[0] ==1);
+    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
-    // Other dimensions must match the decomposition of the four-D fields 
+    assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
-    for(int d=0;d<4;d++){
+    assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
      assert(FourDimGrid._simd_layout[d]=1);
      assert(FourDimRedBlackGrid._simd_layout[d]=1);
      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
-      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
-      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
-      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
    }
  } else {
    // some assertions
    assert(FiveDimGrid._ndimension==5);
    assert(FourDimGrid._ndimension==4);
    assert(FiveDimRedBlackGrid._ndimension==5);
    assert(FourDimRedBlackGrid._ndimension==4);
    assert(FiveDimRedBlackGrid._checker_dim==1);
    // Dimension zero of the five-d is the Ls direction
    Ls=FiveDimGrid._fdimensions[0];
    assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
    assert(FiveDimRedBlackGrid._processors[0] ==1);
    assert(FiveDimRedBlackGrid._simd_layout[0]==1);
    assert(FiveDimGrid._processors[0]         ==1);
    assert(FiveDimGrid._simd_layout[0]        ==1);
    // Other dimensions must match the decomposition of the four-D fields 
    for(int d=0;d<4;d++){
      assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
      assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
      assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
      assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
      assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
      assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
      assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
      assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
      assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
    }
  }
-    
+
  // Allocate the required comms buffer
  ImportGauge(_Umu);
-}
+}  
-  /*
+
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
-               GridCartesian         &FiveDimGrid,
+				       GridCartesian         &FiveDimGrid,
-               GridRedBlackCartesian &FiveDimRedBlackGrid,
+				       GridRedBlackCartesian &FiveDimRedBlackGrid,
-               GridCartesian         &FourDimGrid,
+				       GridCartesian         &FourDimGrid,
-               RealD _M5,const ImplParams &p) :
+				       RealD _M5,const ImplParams &p) :
  Kernels(p),
  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimGrid),
  UmuOdd (_FourDimGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimGrid)
 {
  int nsimd = Simd::Nsimd();
@ -171,68 +148,13 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
  }
  {
    GaugeField HUmu(_Umu._grid);
    HUmu = _Umu*(-0.5);
    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
    UmuEven=Umu;// Really want a reference.
    UmuOdd =Umu;
  }
 }  
  */
 // Prints the accumulated Dhop and Deriv call counts and timings of this
 // operator to std::cout (prefixed with GridLogMessage), followed by the
 // reports of the three stencils. A section is only printed when its call
 // counter is non-zero, so the per-call averages never divide by a zero count.
 // NOTE(review): the mflops expressions divide by DhopComputeTime /
 // DerivDhopComputeTime, which are not checked against zero here.
 template<class Impl>
 void WilsonFermion5D<Impl>::Report(void)
 {
    // volume = Ls times the product of the Nd default lattice extents
    std::vector<int> latt = GridDefaultLatt();          
    RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
    // processor count of the four-dimensional grid; divisor for the per-rank / per-node figures
    RealD NP = _FourDimGrid->_Nprocessors;
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls     : " << DhopCalls   << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " << DhopCommTime / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " << DhopComputeTime << " us" << std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;
    // 1344 is presumably the flop count per site of one Dhop -- TODO confirm.
    // The timers are printed as "us", so flops per microsecond reads as Mflop/s.
    RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
   }
  if ( DerivCalls > 0 ) {
    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls    : " <<DerivCalls <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " <<DerivCommTime <<" us"<<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " <<DerivComputeTime <<" us"<<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time  : " <<DerivDhopComputeTime <<" us"<<std::endl;
    std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
    // 144 is presumably the per-site flop count of the derivative hop -- TODO confirm.
    // Unlike the Dhop figure above, this rate is not divided by 2 and is based
    // on DerivDhopComputeTime rather than DerivComputeTime.
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
  }
  // Stencil reports are emitted only if at least one of the two counters is non-zero.
  if (DerivCalls > 0 || DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl;  Stencil.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl;  StencilOdd.Report();
  }
 }
 // Resets all Dhop and Deriv call counters and timers of this operator to zero
 // and forwards the reset to the three stencils (Stencil, StencilEven, StencilOdd).
 template<class Impl>
 void WilsonFermion5D<Impl>::ZeroCounters(void) {
  // Dhop statistics
  DhopCalls       = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  // Deriv statistics
  DerivCalls       = 0;
  DerivCommTime    = 0;
  DerivComputeTime = 0;
  DerivDhopComputeTime = 0;
  // each stencil keeps its own counters
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
 }
 template<class Impl>
@ -268,20 +190,19 @@ PARALLEL_FOR_LOOP
    for(int s=0;s<Ls;s++){
      int sU=ss;
      int sF = s+Ls*sU; 
-      Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
+      Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
    }
  }
 };
 template<class Impl>
 void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
-            DoubledGaugeField & U,
+					  DoubledGaugeField & U,
-            GaugeField &mat,
+					  GaugeField &mat,
-            const FermionField &A,
+					  const FermionField &A,
-            const FermionField &B,
+					  const FermionField &B,
-            int dag)
+					  int dag)
 {
  DerivCalls++;
  assert((dag==DaggerNo) ||(dag==DaggerYes));
  conformable(st._grid,A._grid);
@ -292,52 +213,51 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
  FermionField Btilde(B._grid);
  FermionField Atilde(B._grid);
  DerivCommTime-=usecond();
  st.HaloExchange(B,compressor);
  DerivCommTime+=usecond();
  Atilde=A;
-  DerivComputeTime-=usecond();
+  for(int mu=0;mu<Nd;mu++){
-  for (int mu = 0; mu < Nd; mu++) {
+      
    ////////////////////////////////////////////////////////////////////////
    // Flip gamma if dag
    ////////////////////////////////////////////////////////////////////////
    int gamma = mu;
-    if (!dag) gamma += Nd;
+    if ( !dag ) gamma+= Nd;
    ////////////////////////
    // Call the single hop
    ////////////////////////
-    DerivDhopComputeTime -= usecond();
+PARALLEL_FOR_LOOP
-    PARALLEL_FOR_LOOP
+    for(int sss=0;sss<U._grid->oSites();sss++){
-    for (int sss = 0; sss < U._grid->oSites(); sss++) {
+      for(int s=0;s<Ls;s++){
-      for (int s = 0; s < Ls; s++) {
+	int sU=sss;
-        int sU = sss;
+	int sF = s+Ls*sU;
        int sF = s + Ls * sU;
-        assert(sF < B._grid->oSites());
+	assert ( sF< B._grid->oSites());
-        assert(sU < U._grid->oSites());
+	assert ( sU< U._grid->oSites());
-        Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
+	Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);
    ////////////////////////////
    // spin trace outer product
    ////////////////////////////
        ////////////////////////////
        // spin trace outer product
        ////////////////////////////
      }
    }
-    DerivDhopComputeTime += usecond();
+
-    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
+    Impl::InsertForce5D(mat,Btilde,Atilde,mu);
  }
  DerivComputeTime += usecond();
 }
 template<class Impl>
-void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
+void WilsonFermion5D<Impl>::DhopDeriv(      GaugeField &mat,
-				      const FermionField &A,
+					    const FermionField &A,
-				      const FermionField &B,
+					    const FermionField &B,
-				      int dag)
+					    int dag)
 {
  conformable(A._grid,FermionGrid());  
  conformable(A._grid,B._grid);
@ -368,9 +288,9 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
-					const FermionField &A,
+				  const FermionField &A,
-					const FermionField &B,
+				  const FermionField &B,
-					int dag)
+				  int dag)
 {
  conformable(A._grid,FermionRedBlackGrid());
  conformable(GaugeRedBlackGrid(),mat._grid);
@ -393,56 +313,30 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
  int LLs = in._grid->_rdimensions[0];
  DhopCommTime-=usecond();
  st.HaloExchange(in,compressor);
  DhopCommTime+=usecond();
  DhopComputeTime-=usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  if (dag == DaggerYes) {
+  if ( dag == DaggerYes ) {
-    PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
-    for (int ss = 0; ss < U._grid->oSites(); ss++) {
+    for(int ss=0;ss<U._grid->oSites();ss++){
-      int sU = ss;
+	int sU=ss;
-      int sF = LLs * sU;
+	int sF=LLs*sU;
-      Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out);
+	Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
    }
-#ifdef AVX512
+  } else {
-  } else if (stat.is_init() ) {
+PARALLEL_FOR_LOOP
-
+    for(int ss=0;ss<U._grid->oSites();ss++){
    int nthreads;
    stat.start();
 #pragma omp parallel
    {
 #pragma omp master
    nthreads = omp_get_num_threads();
    int mythread = omp_get_thread_num();
    stat.enter(mythread);
 #pragma omp for nowait
    for(int ss=0;ss<U._grid->oSites();ss++) {
      int sU=ss;
      int sF=LLs*sU;
-      Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
+      Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
    }
    stat.exit(mythread);
    }
    stat.accum(nthreads);
 #endif
  } else {
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
      Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
    }
  }
  DhopComputeTime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check
@ -454,7 +348,6 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls++;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check
@ -466,7 +359,6 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=2;
  conformable(in._grid,FermionGrid()); // verifies full grid
  conformable(in._grid,out._grid);
@ -484,6 +376,8 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
 template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
 template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
 }}
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@ -31,157 +31,142 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_WILSON_FERMION_5D_H
 #define  GRID_QCD_WILSON_FERMION_5D_H
 #include <Grid/Stat.h>
 namespace Grid {
 namespace QCD {
-  ////////////////////////////////////////////////////////////////////////////////
+  namespace QCD {
  // This is the 4d red black case appropriate to support
  //
  // parity = (x+y+z+t)|2;
  // generalised five dim fermions like mobius, zolotarev etc..	
  //
  // i.e. even even contains fifth dim hopping term.
  //
  // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
  ////////////////////////////////////////////////////////////////////////////////
-  class WilsonFermion5DStatic { 
+    ////////////////////////////////////////////////////////////////////////////////
-  public:
+    // This is the 4d red black case appropriate to support
-    // S-direction is INNERMOST and takes no part in the parity.
+    //
-    static const std::vector<int> directions;
+    // parity = (x+y+z+t)|2;
-    static const std::vector<int> displacements;
+    // generalised five dim fermions like mobius, zolotarev etc..	
-    const int npoint = 8;
+    //
-  };
+    // i.e. even even contains fifth dim hopping term.
-  
+    //
-  template<class Impl>
+    // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
-  class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
+    ////////////////////////////////////////////////////////////////////////////////
-  {
+
-  public:
+    class WilsonFermion5DStatic { 
-    INHERIT_IMPL_TYPES(Impl);
+    public:
-    typedef WilsonKernels<Impl> Kernels;
+      // S-direction is INNERMOST and takes no part in the parity.
-    PmuStat stat;
+      static const std::vector<int> directions;
-    
+      static const std::vector<int> displacements;
-    void Report(void);
+      const int npoint = 8;
-    void ZeroCounters(void);
+    };
-    double DhopCalls;
+
-    double DhopCommTime;
+    template<class Impl>
-    double DhopComputeTime;
+    class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
-    
+    {
-    double DerivCalls;
+    public:
-    double DerivCommTime;
+     INHERIT_IMPL_TYPES(Impl);
-    double DerivComputeTime;
+     typedef WilsonKernels<Impl> Kernels;
-    double DerivDhopComputeTime;
+
-    
+      ///////////////////////////////////////////////////////////////
-    ///////////////////////////////////////////////////////////////
+      // Implement the abstract base
-    // Implement the abstract base
+      ///////////////////////////////////////////////////////////////
-    ///////////////////////////////////////////////////////////////
+      GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
-    GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
+      GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
-    GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
+      GridBase *FermionGrid(void)            { return _FiveDimGrid;}
-    GridBase *FermionGrid(void)            { return _FiveDimGrid;}
+      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
-    GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
+
-    
+      // full checkerboard operations; leave unimplemented as abstract for now
-    // full checkerboard operations; leave unimplemented as abstract for now
+      virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
-    virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+      virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
-    virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+
-    
+      // half checkerboard operations; leave unimplemented as abstract for now
-    // half checkerboard operations; leave unimplemented as abstract for now
+      virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
-    virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
-    virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
-    virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
+
-    
+      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
-    virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
-    virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
-    virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
-    virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+
-    
+      // These can be overridden by fancy 5d chiral action
-    // These can be overridden by fancy 5d chiral action
+      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-    virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-    virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-    virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
-    
+      // Implement hopping term non-hermitian hopping term; half cb or both
-    // Implement hopping term non-hermitian hopping term; half cb or both
+      // Implement s-diagonal DW
-    // Implement s-diagonal DW
+      void DW    (const FermionField &in, FermionField &out,int dag);
-    void DW    (const FermionField &in, FermionField &out,int dag);
+      void Dhop  (const FermionField &in, FermionField &out,int dag);
-    void Dhop  (const FermionField &in, FermionField &out,int dag);
+      void DhopOE(const FermionField &in, FermionField &out,int dag);
-    void DhopOE(const FermionField &in, FermionField &out,int dag);
+      void DhopEO(const FermionField &in, FermionField &out,int dag);
-    void DhopEO(const FermionField &in, FermionField &out,int dag);
+
-    
+      // add a DhopComm
    // add a DhopComm
      // -- suboptimal interface will presently trigger multiple comms.
-    void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
    ///////////////////////////////////////////////////////////////
    // New methods added 
    ///////////////////////////////////////////////////////////////
    void DerivInternal(StencilImpl & st,
 		       DoubledGaugeField & U,
 		       GaugeField &mat,
 		       const FermionField &A,
 		       const FermionField &B,
 		       int dag);
    void DhopInternal(StencilImpl & st,
 		      LebesgueOrder &lo,
 		      DoubledGaugeField &U,
 		      const FermionField &in, 
 		      FermionField &out,
 		      int dag);
    // Constructors
    WilsonFermion5D(GaugeField &_Umu,
 		    GridCartesian         &FiveDimGrid,
 		    GridRedBlackCartesian &FiveDimRedBlackGrid,
 		    GridCartesian         &FourDimGrid,
 		    GridRedBlackCartesian &FourDimRedBlackGrid,
 		    double _M5,const ImplParams &p= ImplParams());
    // Constructors
    /*
      WilsonFermion5D(int simd, 
      GaugeField &_Umu,
      GridCartesian         &FiveDimGrid,
      GridRedBlackCartesian &FiveDimRedBlackGrid,
      GridCartesian         &FourDimGrid,
      double _M5,const ImplParams &p= ImplParams());
    */
    // DoubleStore
    void ImportGauge(const GaugeField &_Umu);
    ///////////////////////////////////////////////////////////////
    // Data members required to support the functionality
    ///////////////////////////////////////////////////////////////
  public:
    // Add these to the support from Wilson
    GridBase *_FourDimGrid;
    GridBase *_FourDimRedBlackGrid;
    GridBase *_FiveDimGrid;
    GridBase *_FiveDimRedBlackGrid;
    double                        M5;
    int Ls;
    //Defines the stencils for even and odd
    StencilImpl Stencil; 
    StencilImpl StencilEven; 
    StencilImpl StencilOdd; 
    // Copy of the gauge field , with even and odd subsets
    DoubledGaugeField Umu;
    DoubledGaugeField UmuEven;
    DoubledGaugeField UmuOdd;
    LebesgueOrder Lebesgue;
    LebesgueOrder LebesgueEvenOdd;
    // Comms buffer
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
  };
-}}
+      ///////////////////////////////////////////////////////////////
      // New methods added 
      ///////////////////////////////////////////////////////////////
      void DerivInternal(StencilImpl & st,
 			 DoubledGaugeField & U,
 			 GaugeField &mat,
 			 const FermionField &A,
 			 const FermionField &B,
 			 int dag);
      void DhopInternal(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
 			const FermionField &in, 
 			FermionField &out,
 			int dag);
      // Constructors
      WilsonFermion5D(GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      double _M5,const ImplParams &p= ImplParams());
      // Constructors
      WilsonFermion5D(int simd, 
 		      GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      double _M5,const ImplParams &p= ImplParams());
      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);
      ///////////////////////////////////////////////////////////////
      // Data members required to support the functionality
      ///////////////////////////////////////////////////////////////
    public:
      // Add these to the support from Wilson
      GridBase *_FourDimGrid;
      GridBase *_FourDimRedBlackGrid;
      GridBase *_FiveDimGrid;
      GridBase *_FiveDimRedBlackGrid;
      double                        M5;
      int Ls;
      //Defines the stencils for even and odd
      StencilImpl Stencil; 
      StencilImpl StencilEven; 
      StencilImpl StencilOdd; 
      // Copy of the gauge field , with even and odd subsets
      DoubledGaugeField Umu;
      DoubledGaugeField UmuEven;
      DoubledGaugeField UmuOdd;
      LebesgueOrder Lebesgue;
      LebesgueOrder LebesgueEvenOdd;
      // Comms buffer
      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    };
  }
 }
 #endif
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@ -1,163 +1,98 @@
-/*************************************************************************************
+    /*************************************************************************************
-Grid physics library, www.github.com/paboyle/Grid
+    Grid physics library, www.github.com/paboyle/Grid 
-Source file: ./lib/qcd/action/fermion/WilsonKernels.h
+    Source file: ./lib/qcd/action/fermion/WilsonKernels.h
-Copyright (C) 2015
+    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-This program is free software; you can redistribute it and/or modify
+    This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
+    it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+    (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+    This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+    GNU General Public License for more details.
-You should have received a copy of the GNU General Public License along
+    You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
+    with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-See the full license in the file "LICENSE" in the top level distribution
+    See the full license in the file "LICENSE" in the top level distribution directory
-directory
+    *************************************************************************************/
-*************************************************************************************/
+    /*  END LEGAL */
-/*  END LEGAL */
+#ifndef  GRID_QCD_DHOP_H
-#ifndef GRID_QCD_DHOP_H
+#define  GRID_QCD_DHOP_H
 #define GRID_QCD_DHOP_H
 namespace Grid {
 namespace QCD {
-  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  namespace QCD {
-  // Helper routines that implement Wilson stencil for a single site.
+
-  // Common to both the WilsonFermion and WilsonFermion5D
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Helper routines that implement Wilson stencil for a single site.
-class WilsonKernelsStatic { 
+    // Common to both the WilsonFermion and WilsonFermion5D
- public:
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // S-direction is INNERMOST and takes no part in the parity.
+    class WilsonKernelsStatic { 
-  static int AsmOpt;  // these are a temporary hack
+    public:
-  static int HandOpt; // these are a temporary hack
+      // S-direction is INNERMOST and takes no part in the parity.
-};
+      static int AsmOpt;  // these are a temporary hack
- 
+      static int HandOpt; // these are a temporary hack
-template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
+    };
- public:
+
-   
+    template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
-  INHERIT_IMPL_TYPES(Impl);
+    public:
-  typedef FermionOperator<Impl> Base;
+
-   
+     INHERIT_IMPL_TYPES(Impl);
-public:
+     typedef FermionOperator<Impl> Base;
  // DiracOptDhopSite overload enabled (through enable_if) when
  // Impl::Dimension == 3 and Nc == 3; EnableBool is a defaulted template
  // parameter that also takes part in that condition.
  //
  // When AVX512 is defined and AsmOpt is set, the whole request
  // (sF, sU, Ls, Ns) is handed to DiracOptAsmDhopSite in a single call.
  // Otherwise Ns sites are visited starting at gauge index sU and fermion
  // index sF; for each site Ls consecutive fermion indices are processed with
  // DiracOptHandDhopSite (HandOpt set) or DiracOptGenericDhopSite.
  template <bool EnableBool = true>
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
 #ifdef AVX512
    if (AsmOpt) {
      WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
    } else {
 #else
    {
 #endif
      // sF and sU are by-value parameters, so the increments below are local to this call
      for (int site = 0; site < Ns; site++) {
 	for (int s = 0; s < Ls; s++) {
 	  if (HandOpt)
 	    WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
 	  else
 	    WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
 	  // next fermion index for the same gauge site
 	  sF++;
 	}
 	// next gauge site once all Ls fermion indices have been handled
 	sU++;
      }
    }
  }
-  template <bool EnableBool = true>
+    public:
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
 	WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
 	sF++;
      }
      sU++;
    }
  }
  // DiracOptDhopSiteDag overload enabled (through enable_if) when
  // Impl::Dimension == 3 and Nc == 3; EnableBool is a defaulted template
  // parameter that also takes part in that condition.
  //
  // When AVX512 is defined and AsmOpt is set, the whole request
  // (sF, sU, Ls, Ns) is handed to DiracOptAsmDhopSiteDag in a single call.
  // Otherwise Ns sites are visited starting at gauge index sU and fermion
  // index sF; for each site Ls consecutive fermion indices are processed with
  // DiracOptHandDhopSiteDag (HandOpt set) or DiracOptGenericDhopSiteDag.
  template <bool EnableBool = true>
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
 #ifdef AVX512
    if (AsmOpt) {
      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
    } else {
 #else
    {
 #endif
      // sF and sU are by-value parameters, so the increments below are local to this call
      for (int site = 0; site < Ns; site++) {
 	for (int s = 0; s < Ls; s++) {
 	  if (HandOpt)
 	    WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 	  else
 	    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 	  // next fermion index for the same gauge site
 	  sF++;
 	}
 	// next gauge site once all Ls fermion indices have been handled
 	sU++;
      }
    }
  }
-  template <bool EnableBool = true>
+     void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
+			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
+			   int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
 	WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 	sF++;
      }
      sU++;
    }
  }
  void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
-private:
+     void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
    private:
     // Specialised variants
-  void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+     void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-			       int sF, int sU, const FermionField &in, FermionField &out);
+			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			   int sF,int sU, const FermionField &in, FermionField &out);
-  void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+     void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-				  int sF, int sU, const FermionField &in, FermionField &out);
+			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in,FermionField &out);
-  void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+     void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-			   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
+			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
  void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 			      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
-  void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+     void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-			    int sF, int sU, const FermionField &in, FermionField &out);
+			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out);
     void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 				 int sF,int sU,const FermionField &in, FermionField &out);
    public:
-  void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+     WilsonKernels(const ImplParams &p= ImplParams());
-			       int sF, int sU, const FermionField &in, FermionField &out);
+     
-      
+    };
 public:
  WilsonKernels(const ImplParams &p = ImplParams());
 };
 }}
  }
 }
 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@ -1,4 +1,4 @@
-/*************************************************************************************
+    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@ -26,71 +26,59 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
+    *************************************************************************************/
-/*  END LEGAL */
+    /*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
 namespace QCD {
 ///////////////////////////////////////////////////////////
 // Default to no assembler implementation
 ///////////////////////////////////////////////////////////
 template<class Impl> void 
 WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
-template<class Impl> void 
+
-WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+  ///////////////////////////////////////////////////////////
-					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+  // Default to no assembler implementation
  ///////////////////////////////////////////////////////////
 template<class Impl>
 void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
 #if defined(AVX512) 
-    
+
-    ///////////////////////////////////////////////////////////
+
-    // If we are AVX512 specialise the single precision routine
+  ///////////////////////////////////////////////////////////
-    ///////////////////////////////////////////////////////////
+  // If we are AVX512 specialise the single precision routine
-    
+  ///////////////////////////////////////////////////////////
 #include <simd/Intel512wilson.h>
 #include <simd/Intel512single.h>
-    
+
 static Vector<vComplexF> signs;
-    
+
-  int setupSigns(void ){
+int setupSigns(void ){
-    Vector<vComplexF> bother(2);
+  Vector<vComplexF> bother(2);
-    signs = bother;
+  signs = bother;
-    vrsign(signs[0]);
+  vrsign(signs[0]);
-    visign(signs[1]);
+  visign(signs[1]);
-    return 1;
+  return 1;
-  }
+}
-  static int signInit = setupSigns();
+static int signInit = setupSigns();
-  
+
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
-  
+
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
-  
+template<>
-#undef KERNEL_DAG
+void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-template<> void 
+						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-      
+
 #define KERNEL_DAG
 template<> void 
 WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
@ -101,38 +89,32 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-				    
+template<>
-#undef KERNEL_DAG
+void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-template<> void 
+								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-				    
+
 #define KERNEL_DAG
 template<> void 
 WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #endif
-#define INSTANTIATE_ASM(A)\
+template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 \
 template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
 INSTANTIATE_ASM(WilsonImplF);
 INSTANTIATE_ASM(WilsonImplD);
 INSTANTIATE_ASM(ZWilsonImplF);
 INSTANTIATE_ASM(ZWilsonImplD);
 INSTANTIATE_ASM(GparityWilsonImplF);
 INSTANTIATE_ASM(GparityWilsonImplD);
 INSTANTIATE_ASM(DomainWallVec5dImplF);
 INSTANTIATE_ASM(DomainWallVec5dImplD);
 INSTANTIATE_ASM(ZDomainWallVec5dImplF);
 INSTANTIATE_ASM(ZDomainWallVec5dImplD);
 template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@ -30,11 +30,7 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);
 #ifdef KERNEL_DAG
    XP_PROJMEM(base);
 #else 
    XM_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR3,perm);
  } else { 
    LOAD_CHI(base);
@ -45,22 +41,15 @@
    MULT_2SPIN_DIR_PFXP(Xp,basep);
  }
  LOAD64(%r10,isigns);
 #ifdef KERNEL_DAG
  XP_RECON;
 #else
  XM_RECON;
-#endif
+
  ////////////////////////////////
  // Yp
  ////////////////////////////////
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    YP_PROJMEM(base);
 #else
    YM_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR2,perm);
  } else { 
    LOAD_CHI(base);
@ -71,11 +60,7 @@
    MULT_2SPIN_DIR_PFYP(Yp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  YP_RECON_ACCUM;
 #else
  YM_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Zp
@ -83,11 +68,7 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    ZP_PROJMEM(base);
 #else
    ZM_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR1,perm);
  } else { 
    LOAD_CHI(base);
@ -98,11 +79,7 @@
    MULT_2SPIN_DIR_PFZP(Zp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  ZP_RECON_ACCUM;
 #else
  ZM_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Tp
@ -110,11 +87,7 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    TP_PROJMEM(base);
 #else
    TM_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR0,perm);
  } else { 
    LOAD_CHI(base);
@ -125,26 +98,16 @@
    MULT_2SPIN_DIR_PFTP(Tp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  TP_RECON_ACCUM;
 #else
  TM_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Xm
  ////////////////////////////////
 #ifndef STREAM_STORE
  basep= (uint64_t) &out._odata[ss];
 #endif
  //  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    XM_PROJMEM(base);
 #else
    XP_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR3,perm);
  } else { 
    LOAD_CHI(base);
@ -155,11 +118,7 @@
    MULT_2SPIN_DIR_PFXM(Xm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  XM_RECON_ACCUM;
 #else
  XP_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Ym
@ -167,11 +126,7 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    YM_PROJMEM(base);
 #else
    YP_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR2,perm);
  } else { 
    LOAD_CHI(base);
@ -182,11 +137,7 @@
    MULT_2SPIN_DIR_PFYM(Ym,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  YM_RECON_ACCUM;
 #else
  YP_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Zm
@ -194,11 +145,7 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    ZM_PROJMEM(base);
 #else
    ZP_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR1,perm);
  } else { 
    LOAD_CHI(base);
@ -209,11 +156,7 @@
    MULT_2SPIN_DIR_PFZM(Zm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  ZM_RECON_ACCUM;
 #else
  ZP_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Tm
@ -221,28 +164,18 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    TM_PROJMEM(base);
 #else
    TP_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR0,perm);
  } else { 
    LOAD_CHI(base);
  }
  base= (uint64_t) &out._odata[ss];
 #ifndef STREAM_STORE
  PREFETCH_CHIMU(base);
 #endif
  {
    MULT_2SPIN_DIR_PFTM(Tm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  TM_RECON_ACCUM;
 #else
  TP_RECON_ACCUM;
 #endif
  basep= st.GetPFInfo(nent,plocal); nent++;
  SAVE_RESULT(base,basep);
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@ -311,9 +311,10 @@ namespace Grid {
 namespace QCD {
-template<class Impl> void 
+template<class Impl>
-WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-					  int ss,int sU,const FermionField &in, FermionField &out)
+					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@ -554,8 +555,9 @@ WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,Doub
 }
 template<class Impl>
-void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-						  int ss,int sU,const FermionField &in, FermionField &out)
+					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
  //  std::cout << "Hand op Dhop "<<std::endl;
  typedef typename Simd::scalar_type S;
@ -796,35 +798,38 @@ void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder
  }
 }
  ////////////////////////////////////////////////
  // Specialise Gparity to simple implementation
  ////////////////////////////////////////////////
-template<> void 
+template<>
-WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							SiteHalfSpinor *buf,
+							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							int sF,int sU,const FermionField &in, FermionField &out)
+							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
-template<> void 
+template<>
-WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							   SiteHalfSpinor *buf,
+								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							   int sF,int sU,const FermionField &in, FermionField &out)
+								int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
-template<> void 
+template<>
-WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							int sF,int sU,const FermionField &in, FermionField &out)
+							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
-template<> void 
+template<>
-WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							   int sF,int sU,const FermionField &in, FermionField &out)
+								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
@ -834,21 +839,46 @@ WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,Lebes
 ////////////// Wilson ; uses this implementation /////////////////////
 // Need Nc=3 though //
-#define INSTANTIATE_THEM(A) \
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int sU,const FermionField &in, FermionField &out); \
+							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							int ss,int sU,const FermionField &in, FermionField &out);
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 INSTANTIATE_THEM(WilsonImplF);
 INSTANTIATE_THEM(WilsonImplD);
 INSTANTIATE_THEM(ZWilsonImplF);
 INSTANTIATE_THEM(ZWilsonImplD);
 INSTANTIATE_THEM(GparityWilsonImplF);
 INSTANTIATE_THEM(GparityWilsonImplD);
 INSTANTIATE_THEM(DomainWallVec5dImplF);
 INSTANTIATE_THEM(DomainWallVec5dImplD);
 INSTANTIATE_THEM(ZDomainWallVec5dImplF);
 INSTANTIATE_THEM(ZDomainWallVec5dImplD);
 }}
--- a/lib/qcd/action/fermion/WilsonTMFermion.h
+++ b/lib/qcd/action/fermion/WilsonTMFermion.h
@ -28,7 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef  GRID_QCD_WILSON_TM_FERMION_H
 #define  GRID_QCD_WILSON_TM_FERMION_H
-#include <Grid/Grid.h>
+#include <Grid.h>
 namespace Grid {
--- a/lib/qcd/action/fermion/ZMobiusFermion.h
+++ b/lib/qcd/action/fermion/ZMobiusFermion.h
@ -1,79 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/ZMobiusFermion.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef  GRID_QCD_ZMOBIUS_FERMION_H
 #define  GRID_QCD_ZMOBIUS_FERMION_H
 #include <Grid/Grid.h>
 namespace Grid {
  namespace QCD {
    // ZMobiusFermion: a CayleyFermion5D<Impl> whose coefficients are set from a
    // caller-supplied list of gamma values together with the scalars b and c.
    // All of the work is done in the constructor; this class declares no data
    // members of its own.
    template<class Impl>
    class ZMobiusFermion : public CayleyFermion5D<Impl>
    {
    public:
     INHERIT_IMPL_TYPES(Impl);
    public:
      // Empty body. NOTE(review): presumably overrides a pure virtual declared in a
      // base class so that this class can be instantiated -- confirm against the base.
      virtual void   Instantiatable(void) {};

      // Constructor.
      //   _Umu, the four grid arguments, _mass, _M5 and p are forwarded unchanged
      //   to the CayleyFermion5D<Impl> constructor.
      //   gamma : must hold at least Ls entries; elements [0, Ls) are read below
      //           without any size check.
      //   b, c  : handed to SetCoefficientsInternal together with the gamma values.
      ZMobiusFermion(GaugeField &_Umu,
                     GridCartesian         &FiveDimGrid,
                     GridRedBlackCartesian &FiveDimRedBlackGrid,
                     GridCartesian         &FourDimGrid,
                     GridRedBlackCartesian &FourDimRedBlackGrid,
                     RealD _mass,RealD _M5,
                     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) :
      CayleyFermion5D<Impl>(_Umu,
                            FiveDimGrid,
                            FiveDimRedBlackGrid,
                            FourDimGrid,
                            FourDimRedBlackGrid,_mass,_M5,p)
      {
        std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;

        // Copy the first Ls caller-supplied values, converting each element
        // from ComplexD to the implementation's Coeff_t on assignment.
        std::vector<Coeff_t> zgamma(this->Ls);
        for(int s=0;s<this->Ls;s++){
          zgamma[s] = gamma[s];
        }

        // Call base setter
        this->SetCoefficientsInternal(1.0,zgamma,b,c);
      }
    };
  }
 }
 #endif
--- a/Show More
+++ b/Show More