mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-18 09:45:55 +01:00
Merge branch 'develop' into feature/dwf-multirhs
This commit is contained in:
commit
4f8b6f26b4
68
.travis.yml
68
.travis.yml
@ -9,68 +9,6 @@ matrix:
|
|||||||
- os: osx
|
- os: osx
|
||||||
osx_image: xcode8.3
|
osx_image: xcode8.3
|
||||||
compiler: clang
|
compiler: clang
|
||||||
- compiler: gcc
|
|
||||||
dist: trusty
|
|
||||||
sudo: required
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
sources:
|
|
||||||
- ubuntu-toolchain-r-test
|
|
||||||
packages:
|
|
||||||
- g++-4.9
|
|
||||||
- libmpfr-dev
|
|
||||||
- libgmp-dev
|
|
||||||
- libmpc-dev
|
|
||||||
- libopenmpi-dev
|
|
||||||
- openmpi-bin
|
|
||||||
- binutils-dev
|
|
||||||
env: VERSION=-4.9
|
|
||||||
- compiler: gcc
|
|
||||||
dist: trusty
|
|
||||||
sudo: required
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
sources:
|
|
||||||
- ubuntu-toolchain-r-test
|
|
||||||
packages:
|
|
||||||
- g++-5
|
|
||||||
- libmpfr-dev
|
|
||||||
- libgmp-dev
|
|
||||||
- libmpc-dev
|
|
||||||
- libopenmpi-dev
|
|
||||||
- openmpi-bin
|
|
||||||
- binutils-dev
|
|
||||||
env: VERSION=-5
|
|
||||||
- compiler: clang
|
|
||||||
dist: trusty
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
sources:
|
|
||||||
- ubuntu-toolchain-r-test
|
|
||||||
packages:
|
|
||||||
- g++-4.8
|
|
||||||
- libmpfr-dev
|
|
||||||
- libgmp-dev
|
|
||||||
- libmpc-dev
|
|
||||||
- libopenmpi-dev
|
|
||||||
- openmpi-bin
|
|
||||||
- binutils-dev
|
|
||||||
env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
|
|
||||||
- compiler: clang
|
|
||||||
dist: trusty
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
sources:
|
|
||||||
- ubuntu-toolchain-r-test
|
|
||||||
packages:
|
|
||||||
- g++-4.8
|
|
||||||
- libmpfr-dev
|
|
||||||
- libgmp-dev
|
|
||||||
- libmpc-dev
|
|
||||||
- libopenmpi-dev
|
|
||||||
- openmpi-bin
|
|
||||||
- binutils-dev
|
|
||||||
env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
|
|
||||||
|
|
||||||
before_install:
|
before_install:
|
||||||
- export GRIDDIR=`pwd`
|
- export GRIDDIR=`pwd`
|
||||||
@ -106,9 +44,3 @@ script:
|
|||||||
- make -j4
|
- make -j4
|
||||||
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
||||||
- make check
|
- make check
|
||||||
- echo make clean
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
|
|
||||||
|
|
||||||
|
|
||||||
|
279
README.md
279
README.md
@ -1,27 +1,44 @@
|
|||||||
# Grid
|
# Grid [),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [](https://travis-ci.org/paboyle/Grid)
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td>Last stable release</td>
|
|
||||||
<td><a href="https://travis-ci.org/paboyle/Grid">
|
|
||||||
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>Development branch</td>
|
|
||||||
<td><a href="https://travis-ci.org/paboyle/Grid">
|
|
||||||
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
**Data parallel C++ mathematical object library.**
|
**Data parallel C++ mathematical object library.**
|
||||||
|
|
||||||
License: GPL v2.
|
License: GPL v2.
|
||||||
|
|
||||||
Last update Nov 2016.
|
Last update June 2017.
|
||||||
|
|
||||||
_Please do not send pull requests to the `master` branch which is reserved for releases._
|
_Please do not send pull requests to the `master` branch which is reserved for releases._
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Description
|
||||||
|
This library provides data parallel C++ container classes with internal memory layout
|
||||||
|
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
|
||||||
|
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
|
||||||
|
array indices to both MPI tasks and SIMD processing elements.
|
||||||
|
|
||||||
|
* Identically shaped arrays then be processed with perfect data parallelisation.
|
||||||
|
* Such identically shaped arrays are called conformable arrays.
|
||||||
|
|
||||||
|
The transformation is based on the observation that Cartesian array processing involves
|
||||||
|
identical processing to be performed on different regions of the Cartesian array.
|
||||||
|
|
||||||
|
The library will both geometrically decompose into MPI tasks and across SIMD lanes.
|
||||||
|
Local vector loops are parallelised with OpenMP pragmas.
|
||||||
|
|
||||||
|
Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
|
||||||
|
optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
|
||||||
|
for most programmers.
|
||||||
|
|
||||||
|
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
|
||||||
|
Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
|
||||||
|
|
||||||
|
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.
|
||||||
|
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
|
||||||
|
|
||||||
|
MPI, OpenMP, and SIMD parallelism are present in the library.
|
||||||
|
Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
|
||||||
|
|
||||||
|
|
||||||
### Compilers
|
### Compilers
|
||||||
|
|
||||||
Intel ICPC v16.0.3 and later
|
Intel ICPC v16.0.3 and later
|
||||||
@ -56,35 +73,25 @@ When you file an issue, please go though the following checklist:
|
|||||||
6. Attach the output of `make V=1`.
|
6. Attach the output of `make V=1`.
|
||||||
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
|
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
|
||||||
|
|
||||||
|
### Required libraries
|
||||||
|
Grid requires:
|
||||||
|
|
||||||
|
[GMP](https://gmplib.org/),
|
||||||
|
|
||||||
### Description
|
[MPFR](http://www.mpfr.org/)
|
||||||
This library provides data parallel C++ container classes with internal memory layout
|
|
||||||
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
|
|
||||||
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
|
|
||||||
array indices to both MPI tasks and SIMD processing elements.
|
|
||||||
|
|
||||||
* Identically shaped arrays then be processed with perfect data parallelisation.
|
Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library.
|
||||||
* Such identically shaped arrays are called conformable arrays.
|
|
||||||
|
|
||||||
The transformation is based on the observation that Cartesian array processing involves
|
Grid optionally uses:
|
||||||
identical processing to be performed on different regions of the Cartesian array.
|
|
||||||
|
|
||||||
The library will both geometrically decompose into MPI tasks and across SIMD lanes.
|
[HDF5](https://support.hdfgroup.org/HDF5/)
|
||||||
Local vector loops are parallelised with OpenMP pragmas.
|
|
||||||
|
|
||||||
Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
|
[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support.
|
||||||
optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
|
|
||||||
for most programmers.
|
|
||||||
|
|
||||||
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
|
[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
|
||||||
Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
|
|
||||||
|
|
||||||
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
|
LAPACK either generic version or Intel MKL library.
|
||||||
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
|
|
||||||
|
|
||||||
MPI, OpenMP, and SIMD parallelism are present in the library.
|
|
||||||
Please see https://arxiv.org/abs/1512.03487 for more detail.
|
|
||||||
|
|
||||||
### Quick start
|
### Quick start
|
||||||
First, start by cloning the repository:
|
First, start by cloning the repository:
|
||||||
@ -155,7 +162,6 @@ The following options can be use with the `--enable-comms=` option to target dif
|
|||||||
| `none` | no communications |
|
| `none` | no communications |
|
||||||
| `mpi[-auto]` | MPI communications |
|
| `mpi[-auto]` | MPI communications |
|
||||||
| `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
|
| `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
|
||||||
| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
|
|
||||||
| `shmem ` | Cray SHMEM communications |
|
| `shmem ` | Cray SHMEM communications |
|
||||||
|
|
||||||
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
|
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
|
||||||
@ -173,7 +179,8 @@ The following options can be use with the `--enable-simd=` option to target diff
|
|||||||
| `AVXFMA4` | AVX (256 bit) + FMA4 |
|
| `AVXFMA4` | AVX (256 bit) + FMA4 |
|
||||||
| `AVX2` | AVX 2 (256 bit) |
|
| `AVX2` | AVX 2 (256 bit) |
|
||||||
| `AVX512` | AVX 512 bit |
|
| `AVX512` | AVX 512 bit |
|
||||||
| `QPX` | QPX (256 bit) |
|
| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) |
|
||||||
|
| `QPX` | IBM QPX (256 bit) |
|
||||||
|
|
||||||
Alternatively, some CPU codenames can be directly used:
|
Alternatively, some CPU codenames can be directly used:
|
||||||
|
|
||||||
@ -195,21 +202,205 @@ The following configuration is recommended for the Intel Knights Landing platfor
|
|||||||
``` bash
|
``` bash
|
||||||
../configure --enable-precision=double\
|
../configure --enable-precision=double\
|
||||||
--enable-simd=KNL \
|
--enable-simd=KNL \
|
||||||
--enable-comms=mpi-auto \
|
--enable-comms=mpi-auto \
|
||||||
--with-gmp=<path> \
|
|
||||||
--with-mpfr=<path> \
|
|
||||||
--enable-mkl \
|
--enable-mkl \
|
||||||
CXX=icpc MPICXX=mpiicpc
|
CXX=icpc MPICXX=mpiicpc
|
||||||
```
|
```
|
||||||
|
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
|
||||||
|
|
||||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
|
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
../configure --enable-precision=double\
|
../configure --enable-precision=double\
|
||||||
--enable-simd=KNL \
|
--enable-simd=KNL \
|
||||||
--enable-comms=mpi \
|
--enable-comms=mpi \
|
||||||
--with-gmp=<path> \
|
|
||||||
--with-mpfr=<path> \
|
|
||||||
--enable-mkl \
|
--enable-mkl \
|
||||||
CXX=CC CC=cc
|
CXX=CC CC=cc
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
|
||||||
|
``` bash
|
||||||
|
--with-gmp=<path> \
|
||||||
|
--with-mpfr=<path> \
|
||||||
|
```
|
||||||
|
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
|
||||||
|
|
||||||
|
Knight's Landing with Intel Omnipath adapters with two adapters per node
|
||||||
|
presently performs better with use of more than one rank per node, using shared memory
|
||||||
|
for interior communication. This is the mpi3 communications implementation.
|
||||||
|
We recommend four ranks per node for best performance, but optimum is local volume dependent.
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
../configure --enable-precision=double\
|
||||||
|
--enable-simd=KNL \
|
||||||
|
--enable-comms=mpi3-auto \
|
||||||
|
--enable-mkl \
|
||||||
|
CC=icpc MPICXX=mpiicpc
|
||||||
|
```
|
||||||
|
|
||||||
|
### Build setup for Intel Haswell Xeon platform
|
||||||
|
|
||||||
|
The following configuration is recommended for the Intel Haswell platform:
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
../configure --enable-precision=double\
|
||||||
|
--enable-simd=AVX2 \
|
||||||
|
--enable-comms=mpi3-auto \
|
||||||
|
--enable-mkl \
|
||||||
|
CXX=icpc MPICXX=mpiicpc
|
||||||
|
```
|
||||||
|
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
|
||||||
|
|
||||||
|
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
|
||||||
|
``` bash
|
||||||
|
--with-gmp=<path> \
|
||||||
|
--with-mpfr=<path> \
|
||||||
|
```
|
||||||
|
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
|
||||||
|
|
||||||
|
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
../configure --enable-precision=double\
|
||||||
|
--enable-simd=AVX2 \
|
||||||
|
--enable-comms=mpi3 \
|
||||||
|
--enable-mkl \
|
||||||
|
CXX=CC CC=cc
|
||||||
|
```
|
||||||
|
Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of
|
||||||
|
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
|
||||||
|
```
|
||||||
|
export I_MPI_PIN=1
|
||||||
|
```
|
||||||
|
This is the default.
|
||||||
|
|
||||||
|
### Build setup for Intel Skylake Xeon platform
|
||||||
|
|
||||||
|
The following configuration is recommended for the Intel Skylake platform:
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
../configure --enable-precision=double\
|
||||||
|
--enable-simd=AVX512 \
|
||||||
|
--enable-comms=mpi3 \
|
||||||
|
--enable-mkl \
|
||||||
|
CXX=mpiicpc
|
||||||
|
```
|
||||||
|
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
|
||||||
|
|
||||||
|
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
|
||||||
|
``` bash
|
||||||
|
--with-gmp=<path> \
|
||||||
|
--with-mpfr=<path> \
|
||||||
|
```
|
||||||
|
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
|
||||||
|
|
||||||
|
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
../configure --enable-precision=double\
|
||||||
|
--enable-simd=AVX512 \
|
||||||
|
--enable-comms=mpi3 \
|
||||||
|
--enable-mkl \
|
||||||
|
CXX=CC CC=cc
|
||||||
|
```
|
||||||
|
Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of
|
||||||
|
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
|
||||||
|
```
|
||||||
|
export I_MPI_PIN=1
|
||||||
|
```
|
||||||
|
This is the default.
|
||||||
|
|
||||||
|
#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping):
|
||||||
|
|
||||||
|
mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18
|
||||||
|
|
||||||
|
TBA
|
||||||
|
|
||||||
|
|
||||||
|
### Build setup for AMD EPYC / RYZEN
|
||||||
|
|
||||||
|
The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
|
||||||
|
So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
|
||||||
|
are common. Each chip within the module exposes a separate NUMA domain.
|
||||||
|
There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
|
||||||
|
MPI-3 is recommended with the use of four ranks per socket,
|
||||||
|
and 8 threads per rank.
|
||||||
|
|
||||||
|
The following configuration is recommended for the AMD EPYC platform.
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
../configure --enable-precision=double\
|
||||||
|
--enable-simd=AVX2 \
|
||||||
|
--enable-comms=mpi3 \
|
||||||
|
CXX=mpicxx
|
||||||
|
```
|
||||||
|
|
||||||
|
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
|
||||||
|
``` bash
|
||||||
|
--with-gmp=<path> \
|
||||||
|
--with-mpfr=<path> \
|
||||||
|
```
|
||||||
|
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
|
||||||
|
|
||||||
|
Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
|
||||||
|
This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this.
|
||||||
|
|
||||||
|
It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and
|
||||||
|
shared memory to communicate within this node:
|
||||||
|
|
||||||
|
mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
|
||||||
|
|
||||||
|
Where omp_bind.sh does the following:
|
||||||
|
```
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
numanode=` expr $PMI_RANK % 8 `
|
||||||
|
basecore=`expr $numanode \* 16`
|
||||||
|
core0=`expr $basecore + 0 `
|
||||||
|
core1=`expr $basecore + 2 `
|
||||||
|
core2=`expr $basecore + 4 `
|
||||||
|
core3=`expr $basecore + 6 `
|
||||||
|
core4=`expr $basecore + 8 `
|
||||||
|
core5=`expr $basecore + 10 `
|
||||||
|
core6=`expr $basecore + 12 `
|
||||||
|
core7=`expr $basecore + 14 `
|
||||||
|
|
||||||
|
export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
|
||||||
|
echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY
|
||||||
|
|
||||||
|
$@
|
||||||
|
```
|
||||||
|
|
||||||
|
Performance:
|
||||||
|
|
||||||
|
#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping):
|
||||||
|
|
||||||
|
mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
|
||||||
|
|
||||||
|
TBA
|
||||||
|
|
||||||
|
### Build setup for BlueGene/Q
|
||||||
|
|
||||||
|
To be written...
|
||||||
|
|
||||||
|
### Build setup for ARM Neon
|
||||||
|
|
||||||
|
To be written...
|
||||||
|
|
||||||
|
### Build setup for laptops, other compilers, non-cluster builds
|
||||||
|
|
||||||
|
Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX),
|
||||||
|
and omit the enable-mkl flag.
|
||||||
|
|
||||||
|
Single node builds are enabled with
|
||||||
|
```
|
||||||
|
--enable-comms=none
|
||||||
|
```
|
||||||
|
|
||||||
|
FFTW support that is not in the default search path may then enabled with
|
||||||
|
```
|
||||||
|
--with-fftw=<installpath>
|
||||||
|
```
|
||||||
|
|
||||||
|
BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
|
||||||
|
|
||||||
|
2
TODO
2
TODO
@ -9,10 +9,8 @@ Large item work list:
|
|||||||
3a)- RNG I/O in ILDG/SciDAC (minor)
|
3a)- RNG I/O in ILDG/SciDAC (minor)
|
||||||
3b)- Precision conversion and sort out localConvert <-- partial/easy
|
3b)- Precision conversion and sort out localConvert <-- partial/easy
|
||||||
3c)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
|
3c)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
|
||||||
--
|
|
||||||
4)- Physical propagator interface
|
4)- Physical propagator interface
|
||||||
5)- Conserved currents
|
5)- Conserved currents
|
||||||
--
|
|
||||||
6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
|
6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
|
||||||
7)- HDCR resume
|
7)- HDCR resume
|
||||||
|
|
||||||
|
800
benchmarks/Benchmark_ITT.cc
Normal file
800
benchmarks/Benchmark_ITT.cc
Normal file
@ -0,0 +1,800 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./benchmarks/Benchmark_memory_bandwidth.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
|
||||||
|
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
|
||||||
|
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
|
||||||
|
|
||||||
|
|
||||||
|
std::vector<int> L_list;
|
||||||
|
std::vector<int> Ls_list;
|
||||||
|
std::vector<double> mflop_list;
|
||||||
|
|
||||||
|
double mflop_ref;
|
||||||
|
double mflop_ref_err;
|
||||||
|
|
||||||
|
int NN_global;
|
||||||
|
|
||||||
|
struct time_statistics{
|
||||||
|
double mean;
|
||||||
|
double err;
|
||||||
|
double min;
|
||||||
|
double max;
|
||||||
|
|
||||||
|
void statistics(std::vector<double> v){
|
||||||
|
double sum = std::accumulate(v.begin(), v.end(), 0.0);
|
||||||
|
mean = sum / v.size();
|
||||||
|
|
||||||
|
std::vector<double> diff(v.size());
|
||||||
|
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
|
||||||
|
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
|
||||||
|
err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
|
||||||
|
|
||||||
|
auto result = std::minmax_element(v.begin(), v.end());
|
||||||
|
min = *result.first;
|
||||||
|
max = *result.second;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
void comms_header(){
|
||||||
|
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
|
||||||
|
<<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
|
||||||
|
};
|
||||||
|
|
||||||
|
Gamma::Algebra Gmu [] = {
|
||||||
|
Gamma::Algebra::GammaX,
|
||||||
|
Gamma::Algebra::GammaY,
|
||||||
|
Gamma::Algebra::GammaZ,
|
||||||
|
Gamma::Algebra::GammaT
|
||||||
|
};
|
||||||
|
struct controls {
|
||||||
|
int Opt;
|
||||||
|
int CommsOverlap;
|
||||||
|
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
|
||||||
|
// int HugePages;
|
||||||
|
};
|
||||||
|
|
||||||
|
class Benchmark {
|
||||||
|
public:
|
||||||
|
static void Decomposition (void ) {
|
||||||
|
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
||||||
|
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvReal : "<<sizeof(vReal )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvComplex : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static void Comms(void)
|
||||||
|
{
|
||||||
|
int Nloop=200;
|
||||||
|
int nmu=0;
|
||||||
|
int maxlat=32;
|
||||||
|
|
||||||
|
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
|
||||||
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
|
||||||
|
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
|
||||||
|
|
||||||
|
std::vector<double> t_time(Nloop);
|
||||||
|
time_statistics timestat;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
|
comms_header();
|
||||||
|
|
||||||
|
for(int lat=4;lat<=maxlat;lat+=4){
|
||||||
|
for(int Ls=8;Ls<=8;Ls*=2){
|
||||||
|
|
||||||
|
std::vector<int> latt_size ({lat*mpi_layout[0],
|
||||||
|
lat*mpi_layout[1],
|
||||||
|
lat*mpi_layout[2],
|
||||||
|
lat*mpi_layout[3]});
|
||||||
|
|
||||||
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
RealD Nrank = Grid._Nprocessors;
|
||||||
|
RealD Nnode = Grid.NodeCount();
|
||||||
|
RealD ppn = Nrank/Nnode;
|
||||||
|
|
||||||
|
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
||||||
|
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
||||||
|
Grid.ShmBufferFreeAll();
|
||||||
|
for(int d=0;d<8;d++){
|
||||||
|
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
}
|
||||||
|
|
||||||
|
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
||||||
|
int ncomm;
|
||||||
|
double dbytes;
|
||||||
|
std::vector<double> times(Nloop);
|
||||||
|
for(int i=0;i<Nloop;i++){
|
||||||
|
|
||||||
|
double start=usecond();
|
||||||
|
|
||||||
|
dbytes=0;
|
||||||
|
ncomm=0;
|
||||||
|
|
||||||
|
parallel_for(int dir=0;dir<8;dir++){
|
||||||
|
|
||||||
|
double tbytes;
|
||||||
|
int mu =dir % 4;
|
||||||
|
|
||||||
|
if (mpi_layout[mu]>1 ) {
|
||||||
|
|
||||||
|
int xmit_to_rank;
|
||||||
|
int recv_from_rank;
|
||||||
|
if ( dir == mu ) {
|
||||||
|
int comm_proc=1;
|
||||||
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
} else {
|
||||||
|
int comm_proc = mpi_layout[mu]-1;
|
||||||
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
}
|
||||||
|
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
|
||||||
|
(void *)&rbuf[dir][0], recv_from_rank,
|
||||||
|
bytes,dir);
|
||||||
|
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
#pragma omp atomic
|
||||||
|
#endif
|
||||||
|
ncomm++;
|
||||||
|
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
#pragma omp atomic
|
||||||
|
#endif
|
||||||
|
dbytes+=tbytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Grid.Barrier();
|
||||||
|
double stop=usecond();
|
||||||
|
t_time[i] = stop-start; // microseconds
|
||||||
|
}
|
||||||
|
|
||||||
|
timestat.statistics(t_time);
|
||||||
|
// for(int i=0;i<t_time.size();i++){
|
||||||
|
// std::cout << i<<" "<<t_time[i]<<std::endl;
|
||||||
|
// }
|
||||||
|
|
||||||
|
dbytes=dbytes*ppn;
|
||||||
|
double xbytes = dbytes*0.5;
|
||||||
|
double rbytes = dbytes*0.5;
|
||||||
|
double bidibytes = dbytes;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
|
||||||
|
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
|
||||||
|
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
|
||||||
|
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
|
||||||
|
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
||||||
|
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void Memory(void)
|
||||||
|
{
|
||||||
|
const int Nvec=8;
|
||||||
|
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
|
||||||
|
typedef iVector<vReal,Nvec> Vec;
|
||||||
|
|
||||||
|
std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
|
||||||
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||||
|
|
||||||
|
uint64_t NP;
|
||||||
|
uint64_t NN;
|
||||||
|
|
||||||
|
|
||||||
|
uint64_t lmax=48;
|
||||||
|
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
|
||||||
|
|
||||||
|
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
|
for(int lat=8;lat<=lmax;lat+=4){
|
||||||
|
|
||||||
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
|
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
|
||||||
|
NP= Grid.RankCount();
|
||||||
|
NN =Grid.NodeCount();
|
||||||
|
|
||||||
|
Vec rn ; random(sRNG,rn);
|
||||||
|
|
||||||
|
LatticeVec z(&Grid); z=rn;
|
||||||
|
LatticeVec x(&Grid); x=rn;
|
||||||
|
LatticeVec y(&Grid); y=rn;
|
||||||
|
double a=2.0;
|
||||||
|
|
||||||
|
uint64_t Nloop=NLOOP;
|
||||||
|
|
||||||
|
double start=usecond();
|
||||||
|
for(int i=0;i<Nloop;i++){
|
||||||
|
z=a*x-y;
|
||||||
|
x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
|
||||||
|
y._odata[4]=z._odata[4];
|
||||||
|
}
|
||||||
|
double stop=usecond();
|
||||||
|
double time = (stop-start)/Nloop*1000;
|
||||||
|
|
||||||
|
double flops=vol*Nvec*2;// mul,add
|
||||||
|
double bytes=3.0*vol*Nvec*sizeof(Real);
|
||||||
|
std::cout<<GridLogMessage<<std::setprecision(3)
|
||||||
|
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
|
||||||
|
<< "\t\t"<< bytes/time/NN <<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static double DWF5(int Ls,int L)
|
||||||
|
{
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD M5 =1.8;
|
||||||
|
|
||||||
|
double mflops;
|
||||||
|
double mflops_best = 0;
|
||||||
|
double mflops_worst= 0;
|
||||||
|
std::vector<double> mflops_all;
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
// Set/Get the layout & grid size
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
|
||||||
|
std::vector<int> local({L,L,L,L});
|
||||||
|
|
||||||
|
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
uint64_t NP = TmpGrid->RankCount();
|
||||||
|
uint64_t NN = TmpGrid->NodeCount();
|
||||||
|
NN_global=NN;
|
||||||
|
uint64_t SHM=NP/NN;
|
||||||
|
|
||||||
|
std::vector<int> internal;
|
||||||
|
if ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
|
||||||
|
else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
|
||||||
|
else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
|
||||||
|
else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
|
||||||
|
else assert(0);
|
||||||
|
|
||||||
|
std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
|
||||||
|
std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
|
||||||
|
|
||||||
|
///////// Welcome message ////////////
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
///////// Lattice Init ////////////
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||||
|
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
///////// RNG Init ////////////
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||||
|
|
||||||
|
///////// Source preparation ////////////
|
||||||
|
LatticeFermion src (sFGrid); random(RNG5,src);
|
||||||
|
LatticeFermion tmp (sFGrid);
|
||||||
|
|
||||||
|
RealD N2 = 1.0/::sqrt(norm2(src));
|
||||||
|
src = src*N2;
|
||||||
|
|
||||||
|
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
|
||||||
|
|
||||||
|
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
|
||||||
|
LatticeFermion src_e (sFrbGrid);
|
||||||
|
LatticeFermion src_o (sFrbGrid);
|
||||||
|
LatticeFermion r_e (sFrbGrid);
|
||||||
|
LatticeFermion r_o (sFrbGrid);
|
||||||
|
LatticeFermion r_eo (sFGrid);
|
||||||
|
LatticeFermion err (sFGrid);
|
||||||
|
{
|
||||||
|
|
||||||
|
pickCheckerboard(Even,src_e,src);
|
||||||
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
|
#if defined(AVX512)
|
||||||
|
const int num_cases = 6;
|
||||||
|
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
|
||||||
|
#else
|
||||||
|
const int num_cases = 4;
|
||||||
|
std::string fmt("U/S ; U/O ; G/S ; G/O ");
|
||||||
|
#endif
|
||||||
|
controls Cases [] = {
|
||||||
|
#ifdef AVX512
|
||||||
|
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
#endif
|
||||||
|
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
|
||||||
|
};
|
||||||
|
|
||||||
|
for(int c=0;c<num_cases;c++) {
|
||||||
|
|
||||||
|
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
|
||||||
|
QCD::WilsonKernelsStatic::Opt = Cases[c].Opt;
|
||||||
|
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||||
|
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
int nwarm = 100;
|
||||||
|
uint64_t ncall = 1000;
|
||||||
|
|
||||||
|
double t0=usecond();
|
||||||
|
sFGrid->Barrier();
|
||||||
|
for(int i=0;i<nwarm;i++){
|
||||||
|
sDw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
|
sFGrid->Barrier();
|
||||||
|
double t1=usecond();
|
||||||
|
|
||||||
|
sDw.ZeroCounters();
|
||||||
|
time_statistics timestat;
|
||||||
|
std::vector<double> t_time(ncall);
|
||||||
|
for(uint64_t i=0;i<ncall;i++){
|
||||||
|
t0=usecond();
|
||||||
|
sDw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
t1=usecond();
|
||||||
|
t_time[i] = t1-t0;
|
||||||
|
}
|
||||||
|
sFGrid->Barrier();
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=(1344.0*volume)/2;
|
||||||
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
|
timestat.statistics(t_time);
|
||||||
|
mf_hi = flops/timestat.min;
|
||||||
|
mf_lo = flops/timestat.max;
|
||||||
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
|
mflops = flops/timestat.mean;
|
||||||
|
mflops_all.push_back(mflops);
|
||||||
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
|
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||||
|
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
|
|
||||||
|
sDw.Report();
|
||||||
|
|
||||||
|
}
|
||||||
|
double robust = mflops_worst/mflops_best;;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
|
||||||
|
std::cout<<GridLogMessage <<fmt << std::endl;
|
||||||
|
std::cout<<GridLogMessage;
|
||||||
|
|
||||||
|
for(int i=0;i<mflops_all.size();i++){
|
||||||
|
std::cout<<mflops_all[i]/NN<<" ; " ;
|
||||||
|
}
|
||||||
|
std::cout<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
return mflops_best;
|
||||||
|
}
|
||||||
|
|
||||||
|
static double DWF(int Ls,int L, double & robust)
|
||||||
|
{
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD M5 =1.8;
|
||||||
|
|
||||||
|
double mflops;
|
||||||
|
double mflops_best = 0;
|
||||||
|
double mflops_worst= 0;
|
||||||
|
std::vector<double> mflops_all;
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
// Set/Get the layout & grid size
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
|
||||||
|
std::vector<int> local({L,L,L,L});
|
||||||
|
|
||||||
|
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
uint64_t NP = TmpGrid->RankCount();
|
||||||
|
uint64_t NN = TmpGrid->NodeCount();
|
||||||
|
NN_global=NN;
|
||||||
|
uint64_t SHM=NP/NN;
|
||||||
|
|
||||||
|
std::vector<int> internal;
|
||||||
|
if ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
|
||||||
|
else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
|
||||||
|
else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
|
||||||
|
else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
|
||||||
|
else assert(0);
|
||||||
|
|
||||||
|
std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
|
||||||
|
std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
|
||||||
|
|
||||||
|
///////// Welcome message ////////////
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
///////// Lattice Init ////////////
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
|
||||||
|
///////// RNG Init ////////////
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||||
|
|
||||||
|
///////// Source preparation ////////////
|
||||||
|
LatticeFermion src (FGrid); random(RNG5,src);
|
||||||
|
LatticeFermion ref (FGrid);
|
||||||
|
LatticeFermion tmp (FGrid);
|
||||||
|
|
||||||
|
RealD N2 = 1.0/::sqrt(norm2(src));
|
||||||
|
src = src*N2;
|
||||||
|
|
||||||
|
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
|
||||||
|
|
||||||
|
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
|
||||||
|
////////////////////////////////////
|
||||||
|
// Naive wilson implementation
|
||||||
|
////////////////////////////////////
|
||||||
|
{
|
||||||
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
|
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||||
|
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ref = zero;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
||||||
|
}
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
|
||||||
|
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||||
|
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||||
|
|
||||||
|
tmp =adj(U[mu])*src;
|
||||||
|
tmp =Cshift(tmp,mu+1,-1);
|
||||||
|
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||||
|
}
|
||||||
|
ref = -0.5*ref;
|
||||||
|
}
|
||||||
|
|
||||||
|
LatticeFermion src_e (FrbGrid);
|
||||||
|
LatticeFermion src_o (FrbGrid);
|
||||||
|
LatticeFermion r_e (FrbGrid);
|
||||||
|
LatticeFermion r_o (FrbGrid);
|
||||||
|
LatticeFermion r_eo (FGrid);
|
||||||
|
LatticeFermion err (FGrid);
|
||||||
|
{
|
||||||
|
|
||||||
|
pickCheckerboard(Even,src_e,src);
|
||||||
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
|
#if defined(AVX512)
|
||||||
|
const int num_cases = 6;
|
||||||
|
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
|
||||||
|
#else
|
||||||
|
const int num_cases = 4;
|
||||||
|
std::string fmt("U/S ; U/O ; G/S ; G/O ");
|
||||||
|
#endif
|
||||||
|
controls Cases [] = {
|
||||||
|
#ifdef AVX512
|
||||||
|
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
#endif
|
||||||
|
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
|
||||||
|
};
|
||||||
|
|
||||||
|
for(int c=0;c<num_cases;c++) {
|
||||||
|
|
||||||
|
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
|
||||||
|
QCD::WilsonKernelsStatic::Opt = Cases[c].Opt;
|
||||||
|
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||||
|
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
int nwarm = 200;
|
||||||
|
double t0=usecond();
|
||||||
|
FGrid->Barrier();
|
||||||
|
for(int i=0;i<nwarm;i++){
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
|
FGrid->Barrier();
|
||||||
|
double t1=usecond();
|
||||||
|
// uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
|
||||||
|
// if (ncall < 500) ncall = 500;
|
||||||
|
uint64_t ncall = 1000;
|
||||||
|
|
||||||
|
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||||
|
|
||||||
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
Dw.ZeroCounters();
|
||||||
|
|
||||||
|
time_statistics timestat;
|
||||||
|
std::vector<double> t_time(ncall);
|
||||||
|
for(uint64_t i=0;i<ncall;i++){
|
||||||
|
t0=usecond();
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
t1=usecond();
|
||||||
|
t_time[i] = t1-t0;
|
||||||
|
}
|
||||||
|
FGrid->Barrier();
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=(1344.0*volume)/2;
|
||||||
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
|
timestat.statistics(t_time);
|
||||||
|
mf_hi = flops/timestat.min;
|
||||||
|
mf_lo = flops/timestat.max;
|
||||||
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
|
mflops = flops/timestat.mean;
|
||||||
|
mflops_all.push_back(mflops);
|
||||||
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
|
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||||
|
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
|
|
||||||
|
Dw.Report();
|
||||||
|
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
Dw.DhopOE(src_e,r_o,DaggerNo);
|
||||||
|
setCheckerboard(r_eo,r_o);
|
||||||
|
setCheckerboard(r_eo,r_e);
|
||||||
|
err = r_eo-ref;
|
||||||
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
assert((norm2(err)<1.0e-4));
|
||||||
|
|
||||||
|
}
|
||||||
|
robust = mflops_worst/mflops_best;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
|
||||||
|
std::cout<<GridLogMessage <<fmt << std::endl;
|
||||||
|
std::cout<<GridLogMessage ;
|
||||||
|
|
||||||
|
for(int i=0;i<mflops_all.size();i++){
|
||||||
|
std::cout<<mflops_all[i]/NN<<" ; " ;
|
||||||
|
}
|
||||||
|
std::cout<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
return mflops_best;
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
||||||
|
#ifdef KNL
|
||||||
|
LebesgueOrder::Block = std::vector<int>({8,2,2,2});
|
||||||
|
#else
|
||||||
|
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
|
||||||
|
#endif
|
||||||
|
Benchmark::Decomposition();
|
||||||
|
|
||||||
|
int do_memory=1;
|
||||||
|
int do_comms =1;
|
||||||
|
int do_su3 =0;
|
||||||
|
int do_wilson=1;
|
||||||
|
int do_dwf =1;
|
||||||
|
|
||||||
|
if ( do_su3 ) {
|
||||||
|
// empty for now
|
||||||
|
}
|
||||||
|
#if 1
|
||||||
|
int sel=2;
|
||||||
|
std::vector<int> L_list({8,12,16,24});
|
||||||
|
#else
|
||||||
|
int sel=1;
|
||||||
|
std::vector<int> L_list({8,12});
|
||||||
|
#endif
|
||||||
|
int selm1=sel-1;
|
||||||
|
std::vector<double> robust_list;
|
||||||
|
|
||||||
|
std::vector<double> wilson;
|
||||||
|
std::vector<double> dwf4;
|
||||||
|
std::vector<double> dwf5;
|
||||||
|
|
||||||
|
if ( do_wilson ) {
|
||||||
|
int Ls=1;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
for(int l=0;l<L_list.size();l++){
|
||||||
|
double robust;
|
||||||
|
wilson.push_back(Benchmark::DWF(1,L_list[l],robust));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int Ls=16;
|
||||||
|
if ( do_dwf ) {
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
for(int l=0;l<L_list.size();l++){
|
||||||
|
double robust;
|
||||||
|
double result = Benchmark::DWF(Ls,L_list[l],robust) ;
|
||||||
|
dwf4.push_back(result);
|
||||||
|
robust_list.push_back(robust);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( do_dwf ) {
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
for(int l=0;l<L_list.size();l++){
|
||||||
|
dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( do_dwf ) {
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
|
||||||
|
for(int l=0;l<L_list.size();l++){
|
||||||
|
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
int NN=NN_global;
|
||||||
|
if ( do_memory ) {
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
Benchmark::Memory();
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( do_comms && (NN>1) ) {
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
Benchmark::Comms();
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( do_dwf ) {
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 \t\t DWF5 " <<std::endl;
|
||||||
|
for(int l=0;l<L_list.size();l++){
|
||||||
|
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
|
||||||
|
std::cout<<std::setprecision(3);
|
||||||
|
std::cout<<GridLogMessage << " Comparison point robustness: " << robust_list[sel] <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
@ -68,7 +68,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
int Nloop=100;
|
int Nloop=100;
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
int maxlat=24;
|
int maxlat=32;
|
||||||
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
|
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
|
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
|
||||||
@ -80,7 +80,7 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
header();
|
header();
|
||||||
for(int lat=4;lat<=maxlat;lat+=4){
|
for(int lat=4;lat<=maxlat;lat+=4){
|
||||||
for(int Ls=8;Ls<=32;Ls*=2){
|
for(int Ls=8;Ls<=8;Ls*=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],
|
std::vector<int> latt_size ({lat*mpi_layout[0],
|
||||||
lat*mpi_layout[1],
|
lat*mpi_layout[1],
|
||||||
@ -92,11 +92,16 @@ int main (int argc, char ** argv)
|
|||||||
RealD Nnode = Grid.NodeCount();
|
RealD Nnode = Grid.NodeCount();
|
||||||
RealD ppn = Nrank/Nnode;
|
RealD ppn = Nrank/Nnode;
|
||||||
|
|
||||||
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
|
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
|
||||||
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
|
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
|
||||||
|
|
||||||
int ncomm;
|
int ncomm;
|
||||||
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
||||||
|
for(int mu=0;mu<8;mu++){
|
||||||
|
xbuf[mu].resize(lat*lat*lat*Ls);
|
||||||
|
rbuf[mu].resize(lat*lat*lat*Ls);
|
||||||
|
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
for(int i=0;i<Nloop;i++){
|
for(int i=0;i<Nloop;i++){
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
@ -112,7 +117,6 @@ int main (int argc, char ** argv)
|
|||||||
int comm_proc=1;
|
int comm_proc=1;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
|
|
||||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
Grid.SendToRecvFromBegin(requests,
|
Grid.SendToRecvFromBegin(requests,
|
||||||
(void *)&xbuf[mu][0],
|
(void *)&xbuf[mu][0],
|
||||||
@ -163,7 +167,7 @@ int main (int argc, char ** argv)
|
|||||||
header();
|
header();
|
||||||
|
|
||||||
for(int lat=4;lat<=maxlat;lat+=4){
|
for(int lat=4;lat<=maxlat;lat+=4){
|
||||||
for(int Ls=8;Ls<=32;Ls*=2){
|
for(int Ls=8;Ls<=8;Ls*=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat,lat,lat,lat});
|
std::vector<int> latt_size ({lat,lat,lat,lat});
|
||||||
|
|
||||||
@ -172,9 +176,14 @@ int main (int argc, char ** argv)
|
|||||||
RealD Nnode = Grid.NodeCount();
|
RealD Nnode = Grid.NodeCount();
|
||||||
RealD ppn = Nrank/Nnode;
|
RealD ppn = Nrank/Nnode;
|
||||||
|
|
||||||
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
|
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
|
||||||
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
|
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
|
||||||
|
|
||||||
|
for(int mu=0;mu<8;mu++){
|
||||||
|
xbuf[mu].resize(lat*lat*lat*Ls);
|
||||||
|
rbuf[mu].resize(lat*lat*lat*Ls);
|
||||||
|
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
int ncomm;
|
int ncomm;
|
||||||
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
||||||
@ -249,7 +258,7 @@ int main (int argc, char ** argv)
|
|||||||
header();
|
header();
|
||||||
|
|
||||||
for(int lat=4;lat<=maxlat;lat+=4){
|
for(int lat=4;lat<=maxlat;lat+=4){
|
||||||
for(int Ls=8;Ls<=32;Ls*=2){
|
for(int Ls=8;Ls<=8;Ls*=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],
|
std::vector<int> latt_size ({lat*mpi_layout[0],
|
||||||
lat*mpi_layout[1],
|
lat*mpi_layout[1],
|
||||||
@ -299,7 +308,7 @@ int main (int argc, char ** argv)
|
|||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&rbuf[mu][0],
|
(void *)&rbuf[mu][0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes,mu);
|
||||||
|
|
||||||
comm_proc = mpi_layout[mu]-1;
|
comm_proc = mpi_layout[mu]-1;
|
||||||
|
|
||||||
@ -310,11 +319,11 @@ int main (int argc, char ** argv)
|
|||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&rbuf[mu+4][0],
|
(void *)&rbuf[mu+4][0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes,mu+4);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Grid.StencilSendToRecvFromComplete(requests);
|
Grid.StencilSendToRecvFromComplete(requests,0);
|
||||||
Grid.Barrier();
|
Grid.Barrier();
|
||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
t_time[i] = stop-start; // microseconds
|
t_time[i] = stop-start; // microseconds
|
||||||
@ -346,7 +355,7 @@ int main (int argc, char ** argv)
|
|||||||
header();
|
header();
|
||||||
|
|
||||||
for(int lat=4;lat<=maxlat;lat+=4){
|
for(int lat=4;lat<=maxlat;lat+=4){
|
||||||
for(int Ls=8;Ls<=32;Ls*=2){
|
for(int Ls=8;Ls<=8;Ls*=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],
|
std::vector<int> latt_size ({lat*mpi_layout[0],
|
||||||
lat*mpi_layout[1],
|
lat*mpi_layout[1],
|
||||||
@ -393,8 +402,8 @@ int main (int argc, char ** argv)
|
|||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&rbuf[mu][0],
|
(void *)&rbuf[mu][0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes,mu);
|
||||||
Grid.StencilSendToRecvFromComplete(requests);
|
Grid.StencilSendToRecvFromComplete(requests,mu);
|
||||||
requests.resize(0);
|
requests.resize(0);
|
||||||
|
|
||||||
comm_proc = mpi_layout[mu]-1;
|
comm_proc = mpi_layout[mu]-1;
|
||||||
@ -406,8 +415,8 @@ int main (int argc, char ** argv)
|
|||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&rbuf[mu+4][0],
|
(void *)&rbuf[mu+4][0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes,mu+4);
|
||||||
Grid.StencilSendToRecvFromComplete(requests);
|
Grid.StencilSendToRecvFromComplete(requests,mu+4);
|
||||||
requests.resize(0);
|
requests.resize(0);
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -436,5 +445,97 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
|
header();
|
||||||
|
|
||||||
|
for(int lat=4;lat<=maxlat;lat+=4){
|
||||||
|
for(int Ls=8;Ls<=8;Ls*=2){
|
||||||
|
|
||||||
|
std::vector<int> latt_size ({lat*mpi_layout[0],
|
||||||
|
lat*mpi_layout[1],
|
||||||
|
lat*mpi_layout[2],
|
||||||
|
lat*mpi_layout[3]});
|
||||||
|
|
||||||
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
RealD Nrank = Grid._Nprocessors;
|
||||||
|
RealD Nnode = Grid.NodeCount();
|
||||||
|
RealD ppn = Nrank/Nnode;
|
||||||
|
|
||||||
|
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
||||||
|
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
||||||
|
Grid.ShmBufferFreeAll();
|
||||||
|
for(int d=0;d<8;d++){
|
||||||
|
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
|
}
|
||||||
|
|
||||||
|
int ncomm;
|
||||||
|
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
||||||
|
double dbytes;
|
||||||
|
for(int i=0;i<Nloop;i++){
|
||||||
|
double start=usecond();
|
||||||
|
|
||||||
|
std::vector<CartesianCommunicator::CommsRequest_t> requests;
|
||||||
|
dbytes=0;
|
||||||
|
ncomm=0;
|
||||||
|
|
||||||
|
parallel_for(int dir=0;dir<8;dir++){
|
||||||
|
|
||||||
|
double tbytes;
|
||||||
|
int mu =dir % 4;
|
||||||
|
|
||||||
|
if (mpi_layout[mu]>1 ) {
|
||||||
|
|
||||||
|
ncomm++;
|
||||||
|
int xmit_to_rank;
|
||||||
|
int recv_from_rank;
|
||||||
|
if ( dir == mu ) {
|
||||||
|
int comm_proc=1;
|
||||||
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
} else {
|
||||||
|
int comm_proc = mpi_layout[mu]-1;
|
||||||
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
}
|
||||||
|
|
||||||
|
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
|
||||||
|
(void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
|
||||||
|
|
||||||
|
#pragma omp atomic
|
||||||
|
dbytes+=tbytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Grid.Barrier();
|
||||||
|
double stop=usecond();
|
||||||
|
t_time[i] = stop-start; // microseconds
|
||||||
|
}
|
||||||
|
|
||||||
|
timestat.statistics(t_time);
|
||||||
|
|
||||||
|
dbytes=dbytes*ppn;
|
||||||
|
double xbytes = dbytes*0.5;
|
||||||
|
double rbytes = dbytes*0.5;
|
||||||
|
double bidibytes = dbytes;
|
||||||
|
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
|
||||||
|
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
|
||||||
|
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
|
||||||
|
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
|
||||||
|
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
||||||
|
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
@ -51,7 +51,13 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
std::vector<int> latt4 = GridDefaultLatt();
|
std::vector<int> latt4 = GridDefaultLatt();
|
||||||
const int Ls=16;
|
int Ls=16;
|
||||||
|
for(int i=0;i<argc;i++)
|
||||||
|
if(std::string(argv[i]) == "-Ls"){
|
||||||
|
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
@ -165,7 +171,7 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
|
||||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
int ncall =1000;
|
int ncall =500;
|
||||||
if (1) {
|
if (1) {
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
Dw.ZeroCounters();
|
Dw.ZeroCounters();
|
||||||
@ -303,6 +309,7 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
assert(sum < 1.0e-4);
|
assert(sum < 1.0e-4);
|
||||||
|
|
||||||
|
|
||||||
if(1){
|
if(1){
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
|
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
|
||||||
@ -381,8 +388,23 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
assert(error<1.0e-4);
|
assert(error<1.0e-4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(0){
|
||||||
|
std::cout << "Single cache warm call to sDw.Dhop " <<std::endl;
|
||||||
|
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
||||||
|
sDw.Dhop(ssrc,sresult,0);
|
||||||
|
PerformanceCounter Counter(i);
|
||||||
|
Counter.Start();
|
||||||
|
sDw.Dhop(ssrc,sresult,0);
|
||||||
|
Counter.Stop();
|
||||||
|
Counter.Report();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if (1)
|
if (1)
|
||||||
{ // Naive wilson dag implementation
|
{ // Naive wilson dag implementation
|
||||||
ref = zero;
|
ref = zero;
|
||||||
@ -487,9 +509,9 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
|
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
|
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
|
||||||
|
|
||||||
//assert(norm2(src_e)<1.0e-4);
|
assert(norm2(src_e)<1.0e-4);
|
||||||
//assert(norm2(src_o)<1.0e-4);
|
assert(norm2(src_o)<1.0e-4);
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
190
benchmarks/Benchmark_gparity.cc
Normal file
190
benchmarks/Benchmark_gparity.cc
Normal file
@ -0,0 +1,190 @@
|
|||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <sstream>
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
template<class d>
|
||||||
|
struct scal {
|
||||||
|
d internal;
|
||||||
|
};
|
||||||
|
|
||||||
|
Gamma::Algebra Gmu [] = {
|
||||||
|
Gamma::Algebra::GammaX,
|
||||||
|
Gamma::Algebra::GammaY,
|
||||||
|
Gamma::Algebra::GammaZ,
|
||||||
|
Gamma::Algebra::GammaT
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef typename GparityDomainWallFermionF::FermionField GparityLatticeFermionF;
|
||||||
|
typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
int Ls=16;
|
||||||
|
for(int i=0;i<argc;i++)
|
||||||
|
if(std::string(argv[i]) == "-Ls"){
|
||||||
|
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Ls = " << Ls << std::endl;
|
||||||
|
|
||||||
|
std::vector<int> latt4 = GridDefaultLatt();
|
||||||
|
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||||
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||||
|
|
||||||
|
GparityLatticeFermionF src (FGrid); random(RNG5,src);
|
||||||
|
RealD N2 = 1.0/::sqrt(norm2(src));
|
||||||
|
src = src*N2;
|
||||||
|
|
||||||
|
GparityLatticeFermionF result(FGrid); result=zero;
|
||||||
|
GparityLatticeFermionF ref(FGrid); ref=zero;
|
||||||
|
GparityLatticeFermionF tmp(FGrid);
|
||||||
|
GparityLatticeFermionF err(FGrid);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||||
|
LatticeGaugeFieldF Umu(UGrid);
|
||||||
|
SU3::HotConfiguration(RNG4,Umu);
|
||||||
|
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||||
|
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD M5 =1.8;
|
||||||
|
|
||||||
|
RealD NP = UGrid->_Nprocessors;
|
||||||
|
RealD NN = UGrid->NodeCount();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::Dhop "<<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||||
|
#endif
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "* SINGLE/SINGLE"<<std::endl;
|
||||||
|
GparityDomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
int ncall =1000;
|
||||||
|
if (1) {
|
||||||
|
FGrid->Barrier();
|
||||||
|
Dw.ZeroCounters();
|
||||||
|
Dw.Dhop(src,result,0);
|
||||||
|
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
|
||||||
|
double t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
__SSC_START;
|
||||||
|
Dw.Dhop(src,result,0);
|
||||||
|
__SSC_STOP;
|
||||||
|
}
|
||||||
|
double t1=usecond();
|
||||||
|
FGrid->Barrier();
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=2*1344*volume*ncall;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||||
|
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
|
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||||
|
Dw.Report();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "* SINGLE/HALF"<<std::endl;
|
||||||
|
GparityDomainWallFermionFH DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
if (1) {
|
||||||
|
FGrid->Barrier();
|
||||||
|
DwH.ZeroCounters();
|
||||||
|
DwH.Dhop(src,result,0);
|
||||||
|
double t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
__SSC_START;
|
||||||
|
DwH.Dhop(src,result,0);
|
||||||
|
__SSC_STOP;
|
||||||
|
}
|
||||||
|
double t1=usecond();
|
||||||
|
FGrid->Barrier();
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=2*1344*volume*ncall;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||||
|
DwH.Report();
|
||||||
|
}
|
||||||
|
|
||||||
|
GridCartesian * UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d);
|
||||||
|
GridCartesian * FGrid_d = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_d);
|
||||||
|
GridRedBlackCartesian * FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_d);
|
||||||
|
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "* DOUBLE/DOUBLE"<<std::endl;
|
||||||
|
GparityLatticeFermionD src_d(FGrid_d);
|
||||||
|
precisionChange(src_d,src);
|
||||||
|
|
||||||
|
LatticeGaugeFieldD Umu_d(UGrid_d);
|
||||||
|
precisionChange(Umu_d,Umu);
|
||||||
|
|
||||||
|
GparityLatticeFermionD result_d(FGrid_d);
|
||||||
|
|
||||||
|
GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
|
||||||
|
if (1) {
|
||||||
|
FGrid_d->Barrier();
|
||||||
|
DwD.ZeroCounters();
|
||||||
|
DwD.Dhop(src_d,result_d,0);
|
||||||
|
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
|
||||||
|
double t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
__SSC_START;
|
||||||
|
DwD.Dhop(src_d,result_d,0);
|
||||||
|
__SSC_STOP;
|
||||||
|
}
|
||||||
|
double t1=usecond();
|
||||||
|
FGrid_d->Barrier();
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=2*1344*volume*ncall;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||||
|
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
|
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||||
|
DwD.Report();
|
||||||
|
}
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
||||||
|
|
@ -55,21 +55,21 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||||
uint64_t lmax=64;
|
uint64_t lmax=96;
|
||||||
#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
|
#define NLOOP (10*lmax*lmax*lmax*lmax/vol)
|
||||||
for(int lat=4;lat<=lmax;lat+=4){
|
for(int lat=8;lat<=lmax;lat+=8){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
|
||||||
uint64_t Nloop=NLOOP;
|
uint64_t Nloop=NLOOP;
|
||||||
|
|
||||||
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
|
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
|
|
||||||
LatticeVec z(&Grid); //random(pRNG,z);
|
LatticeVec z(&Grid);// random(pRNG,z);
|
||||||
LatticeVec x(&Grid); //random(pRNG,x);
|
LatticeVec x(&Grid);// random(pRNG,x);
|
||||||
LatticeVec y(&Grid); //random(pRNG,y);
|
LatticeVec y(&Grid);// random(pRNG,y);
|
||||||
double a=2.0;
|
double a=2.0;
|
||||||
|
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ int main (int argc, char ** argv)
|
|||||||
double time = (stop-start)/Nloop*1000;
|
double time = (stop-start)/Nloop*1000;
|
||||||
|
|
||||||
double flops=vol*Nvec*2;// mul,add
|
double flops=vol*Nvec*2;// mul,add
|
||||||
double bytes=3*vol*Nvec*sizeof(Real);
|
double bytes=3.0*vol*Nvec*sizeof(Real);
|
||||||
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
|
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -94,17 +94,17 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||||
|
|
||||||
for(int lat=4;lat<=lmax;lat+=4){
|
for(int lat=8;lat<=lmax;lat+=8){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
|
||||||
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
|
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
|
|
||||||
LatticeVec z(&Grid); //random(pRNG,z);
|
LatticeVec z(&Grid);// random(pRNG,z);
|
||||||
LatticeVec x(&Grid); //random(pRNG,x);
|
LatticeVec x(&Grid);// random(pRNG,x);
|
||||||
LatticeVec y(&Grid); //random(pRNG,y);
|
LatticeVec y(&Grid);// random(pRNG,y);
|
||||||
double a=2.0;
|
double a=2.0;
|
||||||
|
|
||||||
uint64_t Nloop=NLOOP;
|
uint64_t Nloop=NLOOP;
|
||||||
@ -119,7 +119,7 @@ int main (int argc, char ** argv)
|
|||||||
double time = (stop-start)/Nloop*1000;
|
double time = (stop-start)/Nloop*1000;
|
||||||
|
|
||||||
double flops=vol*Nvec*2;// mul,add
|
double flops=vol*Nvec*2;// mul,add
|
||||||
double bytes=3*vol*Nvec*sizeof(Real);
|
double bytes=3.0*vol*Nvec*sizeof(Real);
|
||||||
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
|
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -129,20 +129,20 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
|
||||||
|
|
||||||
for(int lat=4;lat<=lmax;lat+=4){
|
for(int lat=8;lat<=lmax;lat+=8){
|
||||||
|
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
uint64_t Nloop=NLOOP;
|
uint64_t Nloop=NLOOP;
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
|
||||||
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
|
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
|
|
||||||
LatticeVec z(&Grid); //random(pRNG,z);
|
LatticeVec z(&Grid);// random(pRNG,z);
|
||||||
LatticeVec x(&Grid); //random(pRNG,x);
|
LatticeVec x(&Grid);// random(pRNG,x);
|
||||||
LatticeVec y(&Grid); //random(pRNG,y);
|
LatticeVec y(&Grid);// random(pRNG,y);
|
||||||
RealD a=2.0;
|
RealD a=2.0;
|
||||||
|
|
||||||
|
|
||||||
@ -154,7 +154,7 @@ int main (int argc, char ** argv)
|
|||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
double time = (stop-start)/Nloop*1000;
|
double time = (stop-start)/Nloop*1000;
|
||||||
|
|
||||||
double bytes=2*vol*Nvec*sizeof(Real);
|
double bytes=2.0*vol*Nvec*sizeof(Real);
|
||||||
double flops=vol*Nvec*1;// mul
|
double flops=vol*Nvec*1;// mul
|
||||||
std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
|
std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
|
||||||
|
|
||||||
@ -166,17 +166,17 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||||
|
|
||||||
for(int lat=4;lat<=lmax;lat+=4){
|
for(int lat=8;lat<=lmax;lat+=8){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
uint64_t Nloop=NLOOP;
|
uint64_t Nloop=NLOOP;
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
|
||||||
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
|
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
LatticeVec z(&Grid); //random(pRNG,z);
|
LatticeVec z(&Grid);// random(pRNG,z);
|
||||||
LatticeVec x(&Grid); //random(pRNG,x);
|
LatticeVec x(&Grid);// random(pRNG,x);
|
||||||
LatticeVec y(&Grid); //random(pRNG,y);
|
LatticeVec y(&Grid);// random(pRNG,y);
|
||||||
RealD a=2.0;
|
RealD a=2.0;
|
||||||
Real nn;
|
Real nn;
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
@ -187,7 +187,7 @@ int main (int argc, char ** argv)
|
|||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
double time = (stop-start)/Nloop*1000;
|
double time = (stop-start)/Nloop*1000;
|
||||||
|
|
||||||
double bytes=vol*Nvec*sizeof(Real);
|
double bytes=1.0*vol*Nvec*sizeof(Real);
|
||||||
double flops=vol*Nvec*2;// mul,add
|
double flops=vol*Nvec*2;// mul,add
|
||||||
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
|
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
|
||||||
|
|
||||||
|
@ -37,12 +37,12 @@ int main (int argc, char ** argv)
|
|||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
#define LMAX (64)
|
#define LMAX (64)
|
||||||
|
|
||||||
int Nloop=20;
|
int64_t Nloop=20;
|
||||||
|
|
||||||
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
int64_t threads = GridThread::GetThreads();
|
||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
@ -54,16 +54,16 @@ int main (int argc, char ** argv)
|
|||||||
for(int lat=2;lat<=LMAX;lat+=2){
|
for(int lat=2;lat<=LMAX;lat+=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
|
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
|
|
||||||
LatticeColourMatrix z(&Grid);// random(pRNG,z);
|
LatticeColourMatrix z(&Grid); random(pRNG,z);
|
||||||
LatticeColourMatrix x(&Grid);// random(pRNG,x);
|
LatticeColourMatrix x(&Grid); random(pRNG,x);
|
||||||
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
LatticeColourMatrix y(&Grid); random(pRNG,y);
|
||||||
|
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
for(int i=0;i<Nloop;i++){
|
for(int64_t i=0;i<Nloop;i++){
|
||||||
x=x*y;
|
x=x*y;
|
||||||
}
|
}
|
||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
@ -86,17 +86,17 @@ int main (int argc, char ** argv)
|
|||||||
for(int lat=2;lat<=LMAX;lat+=2){
|
for(int lat=2;lat<=LMAX;lat+=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
|
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
|
|
||||||
LatticeColourMatrix z(&Grid); //random(pRNG,z);
|
LatticeColourMatrix z(&Grid); random(pRNG,z);
|
||||||
LatticeColourMatrix x(&Grid); //random(pRNG,x);
|
LatticeColourMatrix x(&Grid); random(pRNG,x);
|
||||||
LatticeColourMatrix y(&Grid); //random(pRNG,y);
|
LatticeColourMatrix y(&Grid); random(pRNG,y);
|
||||||
|
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
for(int i=0;i<Nloop;i++){
|
for(int64_t i=0;i<Nloop;i++){
|
||||||
z=x*y;
|
z=x*y;
|
||||||
}
|
}
|
||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
@ -117,17 +117,17 @@ int main (int argc, char ** argv)
|
|||||||
for(int lat=2;lat<=LMAX;lat+=2){
|
for(int lat=2;lat<=LMAX;lat+=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
|
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
|
|
||||||
LatticeColourMatrix z(&Grid); //random(pRNG,z);
|
LatticeColourMatrix z(&Grid); random(pRNG,z);
|
||||||
LatticeColourMatrix x(&Grid); //random(pRNG,x);
|
LatticeColourMatrix x(&Grid); random(pRNG,x);
|
||||||
LatticeColourMatrix y(&Grid); //random(pRNG,y);
|
LatticeColourMatrix y(&Grid); random(pRNG,y);
|
||||||
|
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
for(int i=0;i<Nloop;i++){
|
for(int64_t i=0;i<Nloop;i++){
|
||||||
mult(z,x,y);
|
mult(z,x,y);
|
||||||
}
|
}
|
||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
@ -148,17 +148,17 @@ int main (int argc, char ** argv)
|
|||||||
for(int lat=2;lat<=LMAX;lat+=2){
|
for(int lat=2;lat<=LMAX;lat+=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
|
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
|
|
||||||
LatticeColourMatrix z(&Grid); //random(pRNG,z);
|
LatticeColourMatrix z(&Grid); random(pRNG,z);
|
||||||
LatticeColourMatrix x(&Grid); //random(pRNG,x);
|
LatticeColourMatrix x(&Grid); random(pRNG,x);
|
||||||
LatticeColourMatrix y(&Grid); //random(pRNG,y);
|
LatticeColourMatrix y(&Grid); random(pRNG,y);
|
||||||
|
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
for(int i=0;i<Nloop;i++){
|
for(int64_t i=0;i<Nloop;i++){
|
||||||
mac(z,x,y);
|
mac(z,x,y);
|
||||||
}
|
}
|
||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
|
67
configure.ac
67
configure.ac
@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
|||||||
################ Get git info
|
################ Get git info
|
||||||
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
|
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
|
||||||
|
|
||||||
|
################ Set flags
|
||||||
|
# do not move!
|
||||||
|
CXXFLAGS="-O3 $CXXFLAGS"
|
||||||
|
|
||||||
############### Checks for programs
|
############### Checks for programs
|
||||||
AC_PROG_CXX
|
AC_PROG_CXX
|
||||||
AC_PROG_RANLIB
|
AC_PROG_RANLIB
|
||||||
@ -27,7 +31,6 @@ AX_GXX_VERSION
|
|||||||
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
|
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
|
||||||
[version of g++ that will compile the code])
|
[version of g++ that will compile the code])
|
||||||
|
|
||||||
CXXFLAGS="-g $CXXFLAGS"
|
|
||||||
|
|
||||||
|
|
||||||
############### Checks for typedefs, structures, and compiler characteristics
|
############### Checks for typedefs, structures, and compiler characteristics
|
||||||
@ -51,9 +54,14 @@ AC_CHECK_HEADERS(malloc/malloc.h)
|
|||||||
AC_CHECK_HEADERS(malloc.h)
|
AC_CHECK_HEADERS(malloc.h)
|
||||||
AC_CHECK_HEADERS(endian.h)
|
AC_CHECK_HEADERS(endian.h)
|
||||||
AC_CHECK_HEADERS(execinfo.h)
|
AC_CHECK_HEADERS(execinfo.h)
|
||||||
|
AC_CHECK_HEADERS(numaif.h)
|
||||||
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
||||||
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
||||||
|
|
||||||
|
############## Standard libraries
|
||||||
|
AC_CHECK_LIB([m],[cos])
|
||||||
|
AC_CHECK_LIB([stdc++],[abort])
|
||||||
|
|
||||||
############### GMP and MPFR
|
############### GMP and MPFR
|
||||||
AC_ARG_WITH([gmp],
|
AC_ARG_WITH([gmp],
|
||||||
[AS_HELP_STRING([--with-gmp=prefix],
|
[AS_HELP_STRING([--with-gmp=prefix],
|
||||||
@ -186,9 +194,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
|
|||||||
|
|
||||||
AC_SEARCH_LIBS([crc32], [z],
|
AC_SEARCH_LIBS([crc32], [z],
|
||||||
[AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
|
[AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
|
||||||
[have_zlib=true],
|
[have_zlib=true] [LIBS="${LIBS} -lz"],
|
||||||
[AC_MSG_ERROR(zlib library was not found in your system.)])
|
[AC_MSG_ERROR(zlib library was not found in your system.)])
|
||||||
|
|
||||||
|
AC_SEARCH_LIBS([move_pages], [numa],
|
||||||
|
[AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
|
||||||
|
[have_libnuma=true] [LIBS="${LIBS} -lnuma"],
|
||||||
|
[AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
|
||||||
|
|
||||||
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
|
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
|
||||||
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
|
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
|
||||||
[have_hdf5=true]
|
[have_hdf5=true]
|
||||||
@ -241,6 +254,7 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
KNL)
|
KNL)
|
||||||
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
|
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
|
||||||
|
AC_DEFINE([KNL],[1],[Knights landing processor])
|
||||||
SIMD_FLAGS='-march=knl';;
|
SIMD_FLAGS='-march=knl';;
|
||||||
GEN)
|
GEN)
|
||||||
AC_DEFINE([GEN],[1],[generic vector code])
|
AC_DEFINE([GEN],[1],[generic vector code])
|
||||||
@ -248,6 +262,9 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
[generic SIMD vector width (in bytes)])
|
[generic SIMD vector width (in bytes)])
|
||||||
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
|
NEONv8)
|
||||||
|
AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
|
||||||
|
SIMD_FLAGS='-march=armv8-a';;
|
||||||
QPX|BGQ)
|
QPX|BGQ)
|
||||||
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
@ -276,6 +293,7 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
KNL)
|
KNL)
|
||||||
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
|
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
|
||||||
|
AC_DEFINE([KNL],[1],[Knights landing processor])
|
||||||
SIMD_FLAGS='-xmic-avx512';;
|
SIMD_FLAGS='-xmic-avx512';;
|
||||||
GEN)
|
GEN)
|
||||||
AC_DEFINE([GEN],[1],[generic vector code])
|
AC_DEFINE([GEN],[1],[generic vector code])
|
||||||
@ -313,8 +331,41 @@ case ${ac_PRECISION} in
|
|||||||
double)
|
double)
|
||||||
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
|
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
|
||||||
;;
|
;;
|
||||||
|
*)
|
||||||
|
AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
###################### Shared memory allocation technique under MPI3
|
||||||
|
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs],
|
||||||
|
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
|
||||||
|
|
||||||
|
case ${ac_SHM} in
|
||||||
|
|
||||||
|
shmget)
|
||||||
|
AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
|
||||||
|
;;
|
||||||
|
|
||||||
|
shmopen)
|
||||||
|
AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
|
||||||
|
;;
|
||||||
|
|
||||||
|
hugetlbfs)
|
||||||
|
AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
|
||||||
|
;;
|
||||||
|
|
||||||
|
*)
|
||||||
|
AC_MSG_ERROR([${ac_SHM} unsupported --enable-shm option]);
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
###################### Shared base path for SHMMMAP
|
||||||
|
AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
|
||||||
|
[Select SHM mmap base path for hugetlbfs])],
|
||||||
|
[ac_SHMPATH=${enable_shmpath}],
|
||||||
|
[ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
|
||||||
|
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
|
||||||
|
|
||||||
############### communication type selection
|
############### communication type selection
|
||||||
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
|
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
|
||||||
[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
||||||
@ -324,14 +375,14 @@ case ${ac_COMMS} in
|
|||||||
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
|
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
|
||||||
comms_type='none'
|
comms_type='none'
|
||||||
;;
|
;;
|
||||||
mpi3l*)
|
|
||||||
AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
|
|
||||||
comms_type='mpi3l'
|
|
||||||
;;
|
|
||||||
mpi3*)
|
mpi3*)
|
||||||
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
|
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
|
||||||
comms_type='mpi3'
|
comms_type='mpi3'
|
||||||
;;
|
;;
|
||||||
|
mpit)
|
||||||
|
AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
|
||||||
|
comms_type='mpit'
|
||||||
|
;;
|
||||||
mpi*)
|
mpi*)
|
||||||
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
|
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
|
||||||
comms_type='mpi'
|
comms_type='mpi'
|
||||||
@ -359,7 +410,7 @@ esac
|
|||||||
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
|
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
|
||||||
AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
|
AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
|
||||||
AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
|
AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
|
||||||
AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
|
AM_CONDITIONAL(BUILD_COMMS_MPIT, [ test "${comms_type}X" == "mpitX" ] )
|
||||||
AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
|
AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
|
||||||
|
|
||||||
############### RNG selection
|
############### RNG selection
|
||||||
@ -464,6 +515,8 @@ compiler version : ${ax_cv_gxx_version}
|
|||||||
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
|
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
|
||||||
Threading : ${ac_openmp}
|
Threading : ${ac_openmp}
|
||||||
Communications type : ${comms_type}
|
Communications type : ${comms_type}
|
||||||
|
Shared memory allocator : ${ac_SHM}
|
||||||
|
Shared memory mmap path : ${ac_SHMPATH}
|
||||||
Default precision : ${ac_PRECISION}
|
Default precision : ${ac_PRECISION}
|
||||||
Software FP16 conversion : ${ac_SFW_FP16}
|
Software FP16 conversion : ${ac_SFW_FP16}
|
||||||
RNG choice : ${ac_RNG}
|
RNG choice : ${ac_RNG}
|
||||||
|
@ -41,9 +41,10 @@ using namespace Hadrons;
|
|||||||
// constructor /////////////////////////////////////////////////////////////////
|
// constructor /////////////////////////////////////////////////////////////////
|
||||||
Environment::Environment(void)
|
Environment::Environment(void)
|
||||||
{
|
{
|
||||||
nd_ = GridDefaultLatt().size();
|
dim_ = GridDefaultLatt();
|
||||||
|
nd_ = dim_.size();
|
||||||
grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
|
grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
|
||||||
GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()),
|
dim_, GridDefaultSimd(nd_, vComplex::Nsimd()),
|
||||||
GridDefaultMpi()));
|
GridDefaultMpi()));
|
||||||
gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
|
gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
|
||||||
auto loc = getGrid()->LocalDimensions();
|
auto loc = getGrid()->LocalDimensions();
|
||||||
@ -132,6 +133,16 @@ unsigned int Environment::getNd(void) const
|
|||||||
return nd_;
|
return nd_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<int> Environment::getDim(void) const
|
||||||
|
{
|
||||||
|
return dim_;
|
||||||
|
}
|
||||||
|
|
||||||
|
int Environment::getDim(const unsigned int mu) const
|
||||||
|
{
|
||||||
|
return dim_[mu];
|
||||||
|
}
|
||||||
|
|
||||||
// random number generator /////////////////////////////////////////////////////
|
// random number generator /////////////////////////////////////////////////////
|
||||||
void Environment::setSeed(const std::vector<int> &seed)
|
void Environment::setSeed(const std::vector<int> &seed)
|
||||||
{
|
{
|
||||||
@ -271,6 +282,21 @@ std::string Environment::getModuleType(const std::string name) const
|
|||||||
return getModuleType(getModuleAddress(name));
|
return getModuleType(getModuleAddress(name));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string Environment::getModuleNamespace(const unsigned int address) const
|
||||||
|
{
|
||||||
|
std::string type = getModuleType(address), ns;
|
||||||
|
|
||||||
|
auto pos2 = type.rfind("::");
|
||||||
|
auto pos1 = type.rfind("::", pos2 - 2);
|
||||||
|
|
||||||
|
return type.substr(pos1 + 2, pos2 - pos1 - 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Environment::getModuleNamespace(const std::string name) const
|
||||||
|
{
|
||||||
|
return getModuleNamespace(getModuleAddress(name));
|
||||||
|
}
|
||||||
|
|
||||||
bool Environment::hasModule(const unsigned int address) const
|
bool Environment::hasModule(const unsigned int address) const
|
||||||
{
|
{
|
||||||
return (address < module_.size());
|
return (address < module_.size());
|
||||||
@ -492,7 +518,14 @@ std::string Environment::getObjectType(const unsigned int address) const
|
|||||||
{
|
{
|
||||||
if (hasRegisteredObject(address))
|
if (hasRegisteredObject(address))
|
||||||
{
|
{
|
||||||
return typeName(object_[address].type);
|
if (object_[address].type)
|
||||||
|
{
|
||||||
|
return typeName(object_[address].type);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return "<no type>";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (hasObject(address))
|
else if (hasObject(address))
|
||||||
{
|
{
|
||||||
@ -532,6 +565,23 @@ Environment::Size Environment::getObjectSize(const std::string name) const
|
|||||||
return getObjectSize(getObjectAddress(name));
|
return getObjectSize(getObjectAddress(name));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned int Environment::getObjectModule(const unsigned int address) const
|
||||||
|
{
|
||||||
|
if (hasObject(address))
|
||||||
|
{
|
||||||
|
return object_[address].module;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
HADRON_ERROR("no object with address " + std::to_string(address));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int Environment::getObjectModule(const std::string name) const
|
||||||
|
{
|
||||||
|
return getObjectModule(getObjectAddress(name));
|
||||||
|
}
|
||||||
|
|
||||||
unsigned int Environment::getObjectLs(const unsigned int address) const
|
unsigned int Environment::getObjectLs(const unsigned int address) const
|
||||||
{
|
{
|
||||||
if (hasRegisteredObject(address))
|
if (hasRegisteredObject(address))
|
||||||
|
@ -106,6 +106,8 @@ public:
|
|||||||
void createGrid(const unsigned int Ls);
|
void createGrid(const unsigned int Ls);
|
||||||
GridCartesian * getGrid(const unsigned int Ls = 1) const;
|
GridCartesian * getGrid(const unsigned int Ls = 1) const;
|
||||||
GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
|
GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
|
||||||
|
std::vector<int> getDim(void) const;
|
||||||
|
int getDim(const unsigned int mu) const;
|
||||||
unsigned int getNd(void) const;
|
unsigned int getNd(void) const;
|
||||||
// random number generator
|
// random number generator
|
||||||
void setSeed(const std::vector<int> &seed);
|
void setSeed(const std::vector<int> &seed);
|
||||||
@ -131,6 +133,8 @@ public:
|
|||||||
std::string getModuleName(const unsigned int address) const;
|
std::string getModuleName(const unsigned int address) const;
|
||||||
std::string getModuleType(const unsigned int address) const;
|
std::string getModuleType(const unsigned int address) const;
|
||||||
std::string getModuleType(const std::string name) const;
|
std::string getModuleType(const std::string name) const;
|
||||||
|
std::string getModuleNamespace(const unsigned int address) const;
|
||||||
|
std::string getModuleNamespace(const std::string name) const;
|
||||||
bool hasModule(const unsigned int address) const;
|
bool hasModule(const unsigned int address) const;
|
||||||
bool hasModule(const std::string name) const;
|
bool hasModule(const std::string name) const;
|
||||||
Graph<unsigned int> makeModuleGraph(void) const;
|
Graph<unsigned int> makeModuleGraph(void) const;
|
||||||
@ -171,6 +175,8 @@ public:
|
|||||||
std::string getObjectType(const std::string name) const;
|
std::string getObjectType(const std::string name) const;
|
||||||
Size getObjectSize(const unsigned int address) const;
|
Size getObjectSize(const unsigned int address) const;
|
||||||
Size getObjectSize(const std::string name) const;
|
Size getObjectSize(const std::string name) const;
|
||||||
|
unsigned int getObjectModule(const unsigned int address) const;
|
||||||
|
unsigned int getObjectModule(const std::string name) const;
|
||||||
unsigned int getObjectLs(const unsigned int address) const;
|
unsigned int getObjectLs(const unsigned int address) const;
|
||||||
unsigned int getObjectLs(const std::string name) const;
|
unsigned int getObjectLs(const std::string name) const;
|
||||||
bool hasObject(const unsigned int address) const;
|
bool hasObject(const unsigned int address) const;
|
||||||
@ -181,6 +187,10 @@ public:
|
|||||||
bool hasCreatedObject(const std::string name) const;
|
bool hasCreatedObject(const std::string name) const;
|
||||||
bool isObject5d(const unsigned int address) const;
|
bool isObject5d(const unsigned int address) const;
|
||||||
bool isObject5d(const std::string name) const;
|
bool isObject5d(const std::string name) const;
|
||||||
|
template <typename T>
|
||||||
|
bool isObjectOfType(const unsigned int address) const;
|
||||||
|
template <typename T>
|
||||||
|
bool isObjectOfType(const std::string name) const;
|
||||||
Environment::Size getTotalSize(void) const;
|
Environment::Size getTotalSize(void) const;
|
||||||
void addOwnership(const unsigned int owner,
|
void addOwnership(const unsigned int owner,
|
||||||
const unsigned int property);
|
const unsigned int property);
|
||||||
@ -197,6 +207,7 @@ private:
|
|||||||
bool dryRun_{false};
|
bool dryRun_{false};
|
||||||
unsigned int traj_, locVol_;
|
unsigned int traj_, locVol_;
|
||||||
// grids
|
// grids
|
||||||
|
std::vector<int> dim_;
|
||||||
GridPt grid4d_;
|
GridPt grid4d_;
|
||||||
std::map<unsigned int, GridPt> grid5d_;
|
std::map<unsigned int, GridPt> grid5d_;
|
||||||
GridRbPt gridRb4d_;
|
GridRbPt gridRb4d_;
|
||||||
@ -343,7 +354,7 @@ T * Environment::getObject(const unsigned int address) const
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
HADRON_ERROR("object with address " + std::to_string(address) +
|
HADRON_ERROR("object with address " + std::to_string(address) +
|
||||||
" does not have type '" + typeid(T).name() +
|
" does not have type '" + typeName(&typeid(T)) +
|
||||||
"' (has type '" + getObjectType(address) + "')");
|
"' (has type '" + getObjectType(address) + "')");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -380,6 +391,37 @@ T * Environment::createLattice(const std::string name)
|
|||||||
return createLattice<T>(getObjectAddress(name));
|
return createLattice<T>(getObjectAddress(name));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool Environment::isObjectOfType(const unsigned int address) const
|
||||||
|
{
|
||||||
|
if (hasRegisteredObject(address))
|
||||||
|
{
|
||||||
|
if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (hasObject(address))
|
||||||
|
{
|
||||||
|
HADRON_ERROR("object with address " + std::to_string(address) +
|
||||||
|
" exists but is not registered");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
HADRON_ERROR("no object with address " + std::to_string(address));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool Environment::isObjectOfType(const std::string name) const
|
||||||
|
{
|
||||||
|
return isObjectOfType<T>(getObjectAddress(name));
|
||||||
|
}
|
||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Environment_hpp_
|
#endif // Hadrons_Environment_hpp_
|
||||||
|
@ -51,23 +51,43 @@ using Grid::operator<<;
|
|||||||
* error with GCC 5 (clang & GCC 6 compile fine without it).
|
* error with GCC 5 (clang & GCC 6 compile fine without it).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// FIXME: find a way to do that in a more general fashion
|
|
||||||
#ifndef FIMPL
|
#ifndef FIMPL
|
||||||
#define FIMPL WilsonImplR
|
#define FIMPL WilsonImplR
|
||||||
#endif
|
#endif
|
||||||
|
#ifndef SIMPL
|
||||||
|
#define SIMPL ScalarImplCR
|
||||||
|
#endif
|
||||||
|
|
||||||
BEGIN_HADRONS_NAMESPACE
|
BEGIN_HADRONS_NAMESPACE
|
||||||
|
|
||||||
// type aliases
|
// type aliases
|
||||||
#define TYPE_ALIASES(FImpl, suffix)\
|
#define FERM_TYPE_ALIASES(FImpl, suffix)\
|
||||||
typedef FermionOperator<FImpl> FMat##suffix; \
|
typedef FermionOperator<FImpl> FMat##suffix; \
|
||||||
typedef typename FImpl::FermionField FermionField##suffix; \
|
typedef typename FImpl::FermionField FermionField##suffix; \
|
||||||
typedef typename FImpl::PropagatorField PropagatorField##suffix; \
|
typedef typename FImpl::PropagatorField PropagatorField##suffix; \
|
||||||
typedef typename FImpl::SitePropagator SitePropagator##suffix; \
|
typedef typename FImpl::SitePropagator SitePropagator##suffix; \
|
||||||
typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;\
|
typedef std::vector<typename FImpl::SitePropagator::scalar_object> \
|
||||||
typedef std::function<void(FermionField##suffix &, \
|
SlicedPropagator##suffix;
|
||||||
|
|
||||||
|
#define GAUGE_TYPE_ALIASES(FImpl, suffix)\
|
||||||
|
typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
|
||||||
|
|
||||||
|
#define SCALAR_TYPE_ALIASES(SImpl, suffix)\
|
||||||
|
typedef typename SImpl::Field ScalarField##suffix;\
|
||||||
|
typedef typename SImpl::Field PropagatorField##suffix;
|
||||||
|
|
||||||
|
#define SOLVER_TYPE_ALIASES(FImpl, suffix)\
|
||||||
|
typedef std::function<void(FermionField##suffix &,\
|
||||||
const FermionField##suffix &)> SolverFn##suffix;
|
const FermionField##suffix &)> SolverFn##suffix;
|
||||||
|
|
||||||
|
#define SINK_TYPE_ALIASES(suffix)\
|
||||||
|
typedef std::function<SlicedPropagator##suffix(const PropagatorField##suffix &)> SinkFn##suffix;
|
||||||
|
|
||||||
|
#define FGS_TYPE_ALIASES(FImpl, suffix)\
|
||||||
|
FERM_TYPE_ALIASES(FImpl, suffix)\
|
||||||
|
GAUGE_TYPE_ALIASES(FImpl, suffix)\
|
||||||
|
SOLVER_TYPE_ALIASES(FImpl, suffix)
|
||||||
|
|
||||||
// logger
|
// logger
|
||||||
class HadronsLogger: public Logger
|
class HadronsLogger: public Logger
|
||||||
{
|
{
|
||||||
|
@ -1,31 +1,3 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: extras/Hadrons/Modules.hpp
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
Copyright (C) 2016
|
|
||||||
|
|
||||||
Author: Antonin Portelli <antonin.portelli@me.com>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
|
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
|
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
|
||||||
@ -36,13 +8,18 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
|
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
|
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
|
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
|
#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MSink/Point.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
|
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
|
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
|
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
|
#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
|
#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
|
||||||
#include <Grid/Hadrons/Modules/Quark.hpp>
|
|
||||||
|
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_DWF_hpp_
|
#ifndef Hadrons_MAction_DWF_hpp_
|
||||||
#define Hadrons_DWF_hpp_
|
#define Hadrons_MAction_DWF_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -56,7 +56,7 @@ template <typename FImpl>
|
|||||||
class TDWF: public Module<DWFPar>
|
class TDWF: public Module<DWFPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl,);
|
FGS_TYPE_ALIASES(FImpl,);
|
||||||
public:
|
public:
|
||||||
// constructor
|
// constructor
|
||||||
TDWF(const std::string name);
|
TDWF(const std::string name);
|
||||||
@ -137,4 +137,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_DWF_hpp_
|
#endif // Hadrons_MAction_DWF_hpp_
|
||||||
|
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_Wilson_hpp_
|
#ifndef Hadrons_MAction_Wilson_hpp_
|
||||||
#define Hadrons_Wilson_hpp_
|
#define Hadrons_MAction_Wilson_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -54,7 +54,7 @@ template <typename FImpl>
|
|||||||
class TWilson: public Module<WilsonPar>
|
class TWilson: public Module<WilsonPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl,);
|
FGS_TYPE_ALIASES(FImpl,);
|
||||||
public:
|
public:
|
||||||
// constructor
|
// constructor
|
||||||
TWilson(const std::string name);
|
TWilson(const std::string name);
|
||||||
|
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_Baryon_hpp_
|
#ifndef Hadrons_MContraction_Baryon_hpp_
|
||||||
#define Hadrons_Baryon_hpp_
|
#define Hadrons_MContraction_Baryon_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -55,9 +55,9 @@ template <typename FImpl1, typename FImpl2, typename FImpl3>
|
|||||||
class TBaryon: public Module<BaryonPar>
|
class TBaryon: public Module<BaryonPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl1, 1);
|
FERM_TYPE_ALIASES(FImpl1, 1);
|
||||||
TYPE_ALIASES(FImpl2, 2);
|
FERM_TYPE_ALIASES(FImpl2, 2);
|
||||||
TYPE_ALIASES(FImpl3, 3);
|
FERM_TYPE_ALIASES(FImpl3, 3);
|
||||||
class Result: Serializable
|
class Result: Serializable
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
@ -121,11 +121,11 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
|
|||||||
|
|
||||||
// FIXME: do contractions
|
// FIXME: do contractions
|
||||||
|
|
||||||
write(writer, "meson", result);
|
// write(writer, "meson", result);
|
||||||
}
|
}
|
||||||
|
|
||||||
END_MODULE_NAMESPACE
|
END_MODULE_NAMESPACE
|
||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Baryon_hpp_
|
#endif // Hadrons_MContraction_Baryon_hpp_
|
||||||
|
@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_DiscLoop_hpp_
|
#ifndef Hadrons_MContraction_DiscLoop_hpp_
|
||||||
#define Hadrons_DiscLoop_hpp_
|
#define Hadrons_MContraction_DiscLoop_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -52,7 +52,7 @@ public:
|
|||||||
template <typename FImpl>
|
template <typename FImpl>
|
||||||
class TDiscLoop: public Module<DiscLoopPar>
|
class TDiscLoop: public Module<DiscLoopPar>
|
||||||
{
|
{
|
||||||
TYPE_ALIASES(FImpl,);
|
FERM_TYPE_ALIASES(FImpl,);
|
||||||
class Result: Serializable
|
class Result: Serializable
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
@ -141,4 +141,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_DiscLoop_hpp_
|
#endif // Hadrons_MContraction_DiscLoop_hpp_
|
||||||
|
@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_Gamma3pt_hpp_
|
#ifndef Hadrons_MContraction_Gamma3pt_hpp_
|
||||||
#define Hadrons_Gamma3pt_hpp_
|
#define Hadrons_MContraction_Gamma3pt_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -72,9 +72,9 @@ public:
|
|||||||
template <typename FImpl1, typename FImpl2, typename FImpl3>
|
template <typename FImpl1, typename FImpl2, typename FImpl3>
|
||||||
class TGamma3pt: public Module<Gamma3ptPar>
|
class TGamma3pt: public Module<Gamma3ptPar>
|
||||||
{
|
{
|
||||||
TYPE_ALIASES(FImpl1, 1);
|
FERM_TYPE_ALIASES(FImpl1, 1);
|
||||||
TYPE_ALIASES(FImpl2, 2);
|
FERM_TYPE_ALIASES(FImpl2, 2);
|
||||||
TYPE_ALIASES(FImpl3, 3);
|
FERM_TYPE_ALIASES(FImpl3, 3);
|
||||||
class Result: Serializable
|
class Result: Serializable
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
@ -167,4 +167,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Gamma3pt_hpp_
|
#endif // Hadrons_MContraction_Gamma3pt_hpp_
|
||||||
|
@ -29,8 +29,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_Meson_hpp_
|
#ifndef Hadrons_MContraction_Meson_hpp_
|
||||||
#define Hadrons_Meson_hpp_
|
#define Hadrons_MContraction_Meson_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -69,7 +69,7 @@ public:
|
|||||||
std::string, q1,
|
std::string, q1,
|
||||||
std::string, q2,
|
std::string, q2,
|
||||||
std::string, gammas,
|
std::string, gammas,
|
||||||
std::string, mom,
|
std::string, sink,
|
||||||
std::string, output);
|
std::string, output);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -77,8 +77,10 @@ template <typename FImpl1, typename FImpl2>
|
|||||||
class TMeson: public Module<MesonPar>
|
class TMeson: public Module<MesonPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl1, 1);
|
FERM_TYPE_ALIASES(FImpl1, 1);
|
||||||
TYPE_ALIASES(FImpl2, 2);
|
FERM_TYPE_ALIASES(FImpl2, 2);
|
||||||
|
FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
|
||||||
|
SINK_TYPE_ALIASES(Scalar);
|
||||||
class Result: Serializable
|
class Result: Serializable
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
@ -115,7 +117,7 @@ TMeson<FImpl1, FImpl2>::TMeson(const std::string name)
|
|||||||
template <typename FImpl1, typename FImpl2>
|
template <typename FImpl1, typename FImpl2>
|
||||||
std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void)
|
std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void)
|
||||||
{
|
{
|
||||||
std::vector<std::string> input = {par().q1, par().q2};
|
std::vector<std::string> input = {par().q1, par().q2, par().sink};
|
||||||
|
|
||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
@ -154,6 +156,9 @@ void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)
|
|||||||
|
|
||||||
|
|
||||||
// execution ///////////////////////////////////////////////////////////////////
|
// execution ///////////////////////////////////////////////////////////////////
|
||||||
|
#define mesonConnected(q1, q2, gSnk, gSrc) \
|
||||||
|
(g5*(gSnk))*(q1)*(adj(gSrc)*g5)*adj(q2)
|
||||||
|
|
||||||
template <typename FImpl1, typename FImpl2>
|
template <typename FImpl1, typename FImpl2>
|
||||||
void TMeson<FImpl1, FImpl2>::execute(void)
|
void TMeson<FImpl1, FImpl2>::execute(void)
|
||||||
{
|
{
|
||||||
@ -161,43 +166,72 @@ void TMeson<FImpl1, FImpl2>::execute(void)
|
|||||||
<< " quarks '" << par().q1 << "' and '" << par().q2 << "'"
|
<< " quarks '" << par().q1 << "' and '" << par().q2 << "'"
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
|
|
||||||
CorrWriter writer(par().output);
|
CorrWriter writer(par().output);
|
||||||
PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
|
|
||||||
PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
|
|
||||||
LatticeComplex c(env().getGrid());
|
|
||||||
Gamma g5(Gamma::Algebra::Gamma5);
|
|
||||||
std::vector<GammaPair> gammaList;
|
|
||||||
std::vector<TComplex> buf;
|
std::vector<TComplex> buf;
|
||||||
std::vector<Result> result;
|
std::vector<Result> result;
|
||||||
std::vector<Real> p;
|
Gamma g5(Gamma::Algebra::Gamma5);
|
||||||
|
std::vector<GammaPair> gammaList;
|
||||||
p = strToVec<Real>(par().mom);
|
int nt = env().getDim(Tp);
|
||||||
LatticeComplex ph(env().getGrid()), coor(env().getGrid());
|
|
||||||
Complex i(0.0,1.0);
|
|
||||||
ph = zero;
|
|
||||||
for(unsigned int mu = 0; mu < env().getNd(); mu++)
|
|
||||||
{
|
|
||||||
LatticeCoordinate(coor, mu);
|
|
||||||
ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
|
|
||||||
}
|
|
||||||
ph = exp((Real)(2*M_PI)*i*ph);
|
|
||||||
|
|
||||||
parseGammaString(gammaList);
|
parseGammaString(gammaList);
|
||||||
|
|
||||||
result.resize(gammaList.size());
|
result.resize(gammaList.size());
|
||||||
for (unsigned int i = 0; i < result.size(); ++i)
|
for (unsigned int i = 0; i < result.size(); ++i)
|
||||||
{
|
{
|
||||||
Gamma gSnk(gammaList[i].first);
|
|
||||||
Gamma gSrc(gammaList[i].second);
|
|
||||||
c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph;
|
|
||||||
sliceSum(c, buf, Tp);
|
|
||||||
|
|
||||||
result[i].gamma_snk = gammaList[i].first;
|
result[i].gamma_snk = gammaList[i].first;
|
||||||
result[i].gamma_src = gammaList[i].second;
|
result[i].gamma_src = gammaList[i].second;
|
||||||
result[i].corr.resize(buf.size());
|
result[i].corr.resize(nt);
|
||||||
for (unsigned int t = 0; t < buf.size(); ++t)
|
}
|
||||||
|
if (env().template isObjectOfType<SlicedPropagator1>(par().q1) and
|
||||||
|
env().template isObjectOfType<SlicedPropagator2>(par().q2))
|
||||||
|
{
|
||||||
|
SlicedPropagator1 &q1 = *env().template getObject<SlicedPropagator1>(par().q1);
|
||||||
|
SlicedPropagator2 &q2 = *env().template getObject<SlicedPropagator2>(par().q2);
|
||||||
|
|
||||||
|
LOG(Message) << "(propagator already sinked)" << std::endl;
|
||||||
|
for (unsigned int i = 0; i < result.size(); ++i)
|
||||||
{
|
{
|
||||||
result[i].corr[t] = TensorRemove(buf[t]);
|
Gamma gSnk(gammaList[i].first);
|
||||||
|
Gamma gSrc(gammaList[i].second);
|
||||||
|
|
||||||
|
for (unsigned int t = 0; t < buf.size(); ++t)
|
||||||
|
{
|
||||||
|
result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
|
||||||
|
PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
|
||||||
|
LatticeComplex c(env().getGrid());
|
||||||
|
|
||||||
|
LOG(Message) << "(using sink '" << par().sink << "')" << std::endl;
|
||||||
|
for (unsigned int i = 0; i < result.size(); ++i)
|
||||||
|
{
|
||||||
|
Gamma gSnk(gammaList[i].first);
|
||||||
|
Gamma gSrc(gammaList[i].second);
|
||||||
|
std::string ns;
|
||||||
|
|
||||||
|
ns = env().getModuleNamespace(env().getObjectModule(par().sink));
|
||||||
|
if (ns == "MSource")
|
||||||
|
{
|
||||||
|
PropagatorField1 &sink =
|
||||||
|
*env().template getObject<PropagatorField1>(par().sink);
|
||||||
|
|
||||||
|
c = trace(mesonConnected(q1, q2, gSnk, gSrc)*sink);
|
||||||
|
sliceSum(c, buf, Tp);
|
||||||
|
}
|
||||||
|
else if (ns == "MSink")
|
||||||
|
{
|
||||||
|
SinkFnScalar &sink = *env().template getObject<SinkFnScalar>(par().sink);
|
||||||
|
|
||||||
|
c = trace(mesonConnected(q1, q2, gSnk, gSrc));
|
||||||
|
buf = sink(c);
|
||||||
|
}
|
||||||
|
for (unsigned int t = 0; t < buf.size(); ++t)
|
||||||
|
{
|
||||||
|
result[i].corr[t] = TensorRemove(buf[t]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
write(writer, "meson", result);
|
write(writer, "meson", result);
|
||||||
@ -207,4 +241,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Meson_hpp_
|
#endif // Hadrons_MContraction_Meson_hpp_
|
||||||
|
@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_WeakHamiltonian_hpp_
|
#ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
|
||||||
#define Hadrons_WeakHamiltonian_hpp_
|
#define Hadrons_MContraction_WeakHamiltonian_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -83,7 +83,7 @@ public:
|
|||||||
class T##modname: public Module<WeakHamiltonianPar>\
|
class T##modname: public Module<WeakHamiltonianPar>\
|
||||||
{\
|
{\
|
||||||
public:\
|
public:\
|
||||||
TYPE_ALIASES(FIMPL,)\
|
FERM_TYPE_ALIASES(FIMPL,)\
|
||||||
class Result: Serializable\
|
class Result: Serializable\
|
||||||
{\
|
{\
|
||||||
public:\
|
public:\
|
||||||
@ -111,4 +111,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_WeakHamiltonian_hpp_
|
#endif // Hadrons_MContraction_WeakHamiltonian_hpp_
|
||||||
|
@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_WeakHamiltonianEye_hpp_
|
#ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_
|
||||||
#define Hadrons_WeakHamiltonianEye_hpp_
|
#define Hadrons_MContraction_WeakHamiltonianEye_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
|
||||||
|
|
||||||
@ -55,4 +55,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_WeakHamiltonianEye_hpp_
|
#endif // Hadrons_MContraction_WeakHamiltonianEye_hpp_
|
||||||
|
@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_WeakHamiltonianNonEye_hpp_
|
#ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
|
||||||
#define Hadrons_WeakHamiltonianNonEye_hpp_
|
#define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
|
||||||
|
|
||||||
@ -54,4 +54,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_WeakHamiltonianNonEye_hpp_
|
#endif // Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
|
||||||
|
@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_WeakNeutral4ptDisc_hpp_
|
#ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
|
||||||
#define Hadrons_WeakNeutral4ptDisc_hpp_
|
#define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
|
||||||
|
|
||||||
@ -56,4 +56,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_WeakNeutral4ptDisc_hpp_
|
#endif // Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
|
||||||
|
@ -1,34 +1,5 @@
|
|||||||
/*************************************************************************************
|
#ifndef Hadrons_MFermion_GaugeProp_hpp_
|
||||||
|
#define Hadrons_MFermion_GaugeProp_hpp_
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: extras/Hadrons/Modules/Quark.hpp
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
Copyright (C) 2016
|
|
||||||
|
|
||||||
Author: Antonin Portelli <antonin.portelli@me.com>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
|
|
||||||
#ifndef Hadrons_Quark_hpp_
|
|
||||||
#define Hadrons_Quark_hpp_
|
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -37,27 +8,29 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
BEGIN_HADRONS_NAMESPACE
|
BEGIN_HADRONS_NAMESPACE
|
||||||
|
|
||||||
/******************************************************************************
|
/******************************************************************************
|
||||||
* TQuark *
|
* GaugeProp *
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
class QuarkPar: Serializable
|
BEGIN_MODULE_NAMESPACE(MFermion)
|
||||||
|
|
||||||
|
class GaugePropPar: Serializable
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar,
|
GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar,
|
||||||
std::string, source,
|
std::string, source,
|
||||||
std::string, solver);
|
std::string, solver);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename FImpl>
|
template <typename FImpl>
|
||||||
class TQuark: public Module<QuarkPar>
|
class TGaugeProp: public Module<GaugePropPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl,);
|
FGS_TYPE_ALIASES(FImpl,);
|
||||||
public:
|
public:
|
||||||
// constructor
|
// constructor
|
||||||
TQuark(const std::string name);
|
TGaugeProp(const std::string name);
|
||||||
// destructor
|
// destructor
|
||||||
virtual ~TQuark(void) = default;
|
virtual ~TGaugeProp(void) = default;
|
||||||
// dependencies/products
|
// dependency relation
|
||||||
virtual std::vector<std::string> getInput(void);
|
virtual std::vector<std::string> getInput(void);
|
||||||
virtual std::vector<std::string> getOutput(void);
|
virtual std::vector<std::string> getOutput(void);
|
||||||
// setup
|
// setup
|
||||||
@ -69,20 +42,20 @@ private:
|
|||||||
SolverFn *solver_{nullptr};
|
SolverFn *solver_{nullptr};
|
||||||
};
|
};
|
||||||
|
|
||||||
MODULE_REGISTER(Quark, TQuark<FIMPL>);
|
MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);
|
||||||
|
|
||||||
/******************************************************************************
|
/******************************************************************************
|
||||||
* TQuark implementation *
|
* TGaugeProp implementation *
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
// constructor /////////////////////////////////////////////////////////////////
|
// constructor /////////////////////////////////////////////////////////////////
|
||||||
template <typename FImpl>
|
template <typename FImpl>
|
||||||
TQuark<FImpl>::TQuark(const std::string name)
|
TGaugeProp<FImpl>::TGaugeProp(const std::string name)
|
||||||
: Module(name)
|
: Module<GaugePropPar>(name)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
// dependencies/products ///////////////////////////////////////////////////////
|
// dependencies/products ///////////////////////////////////////////////////////
|
||||||
template <typename FImpl>
|
template <typename FImpl>
|
||||||
std::vector<std::string> TQuark<FImpl>::getInput(void)
|
std::vector<std::string> TGaugeProp<FImpl>::getInput(void)
|
||||||
{
|
{
|
||||||
std::vector<std::string> in = {par().source, par().solver};
|
std::vector<std::string> in = {par().source, par().solver};
|
||||||
|
|
||||||
@ -90,7 +63,7 @@ std::vector<std::string> TQuark<FImpl>::getInput(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename FImpl>
|
template <typename FImpl>
|
||||||
std::vector<std::string> TQuark<FImpl>::getOutput(void)
|
std::vector<std::string> TGaugeProp<FImpl>::getOutput(void)
|
||||||
{
|
{
|
||||||
std::vector<std::string> out = {getName(), getName() + "_5d"};
|
std::vector<std::string> out = {getName(), getName() + "_5d"};
|
||||||
|
|
||||||
@ -99,7 +72,7 @@ std::vector<std::string> TQuark<FImpl>::getOutput(void)
|
|||||||
|
|
||||||
// setup ///////////////////////////////////////////////////////////////////////
|
// setup ///////////////////////////////////////////////////////////////////////
|
||||||
template <typename FImpl>
|
template <typename FImpl>
|
||||||
void TQuark<FImpl>::setup(void)
|
void TGaugeProp<FImpl>::setup(void)
|
||||||
{
|
{
|
||||||
Ls_ = env().getObjectLs(par().solver);
|
Ls_ = env().getObjectLs(par().solver);
|
||||||
env().template registerLattice<PropagatorField>(getName());
|
env().template registerLattice<PropagatorField>(getName());
|
||||||
@ -111,13 +84,13 @@ void TQuark<FImpl>::setup(void)
|
|||||||
|
|
||||||
// execution ///////////////////////////////////////////////////////////////////
|
// execution ///////////////////////////////////////////////////////////////////
|
||||||
template <typename FImpl>
|
template <typename FImpl>
|
||||||
void TQuark<FImpl>::execute(void)
|
void TGaugeProp<FImpl>::execute(void)
|
||||||
{
|
{
|
||||||
LOG(Message) << "Computing quark propagator '" << getName() << "'"
|
LOG(Message) << "Computing quark propagator '" << getName() << "'"
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
|
|
||||||
FermionField source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)),
|
FermionField source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)),
|
||||||
tmp(env().getGrid());
|
tmp(env().getGrid());
|
||||||
std::string propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
|
std::string propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
|
||||||
PropagatorField &prop = *env().template createLattice<PropagatorField>(propName);
|
PropagatorField &prop = *env().template createLattice<PropagatorField>(propName);
|
||||||
PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);
|
PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);
|
||||||
@ -128,7 +101,7 @@ void TQuark<FImpl>::execute(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
LOG(Message) << "Inverting using solver '" << par().solver
|
LOG(Message) << "Inverting using solver '" << par().solver
|
||||||
<< "' on source '" << par().source << "'" << std::endl;
|
<< "' on source '" << par().source << "'" << std::endl;
|
||||||
for (unsigned int s = 0; s < Ns; ++s)
|
for (unsigned int s = 0; s < Ns; ++s)
|
||||||
for (unsigned int c = 0; c < Nc; ++c)
|
for (unsigned int c = 0; c < Nc; ++c)
|
||||||
{
|
{
|
||||||
@ -170,7 +143,7 @@ void TQuark<FImpl>::execute(void)
|
|||||||
if (Ls_ > 1)
|
if (Ls_ > 1)
|
||||||
{
|
{
|
||||||
PropagatorField &p4d =
|
PropagatorField &p4d =
|
||||||
*env().template getObject<PropagatorField>(getName());
|
*env().template getObject<PropagatorField>(getName());
|
||||||
|
|
||||||
axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
|
axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
|
||||||
axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
|
axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
|
||||||
@ -180,6 +153,8 @@ void TQuark<FImpl>::execute(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
END_MODULE_NAMESPACE
|
||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Quark_hpp_
|
#endif // Hadrons_MFermion_GaugeProp_hpp_
|
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_Load_hpp_
|
#ifndef Hadrons_MGauge_Load_hpp_
|
||||||
#define Hadrons_Load_hpp_
|
#define Hadrons_MGauge_Load_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -70,4 +70,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Load_hpp_
|
#endif // Hadrons_MGauge_Load_hpp_
|
||||||
|
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_Random_hpp_
|
#ifndef Hadrons_MGauge_Random_hpp_
|
||||||
#define Hadrons_Random_hpp_
|
#define Hadrons_MGauge_Random_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Random_hpp_
|
#endif // Hadrons_MGauge_Random_hpp_
|
||||||
|
88
extras/Hadrons/Modules/MGauge/StochEm.cc
Normal file
88
extras/Hadrons/Modules/MGauge/StochEm.cc
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: extras/Hadrons/Modules/MGauge/StochEm.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
Copyright (C) 2016
|
||||||
|
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
|
||||||
|
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Hadrons;
|
||||||
|
using namespace MGauge;
|
||||||
|
|
||||||
|
/******************************************************************************
|
||||||
|
* TStochEm implementation *
|
||||||
|
******************************************************************************/
|
||||||
|
// constructor /////////////////////////////////////////////////////////////////
|
||||||
|
TStochEm::TStochEm(const std::string name)
|
||||||
|
: Module<StochEmPar>(name)
|
||||||
|
{}
|
||||||
|
|
||||||
|
// dependencies/products ///////////////////////////////////////////////////////
|
||||||
|
std::vector<std::string> TStochEm::getInput(void)
|
||||||
|
{
|
||||||
|
std::vector<std::string> in;
|
||||||
|
|
||||||
|
return in;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> TStochEm::getOutput(void)
|
||||||
|
{
|
||||||
|
std::vector<std::string> out = {getName()};
|
||||||
|
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// setup ///////////////////////////////////////////////////////////////////////
|
||||||
|
void TStochEm::setup(void)
|
||||||
|
{
|
||||||
|
if (!env().hasRegisteredObject("_" + getName() + "_weight"))
|
||||||
|
{
|
||||||
|
env().registerLattice<EmComp>("_" + getName() + "_weight");
|
||||||
|
}
|
||||||
|
env().registerLattice<EmField>(getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
// execution ///////////////////////////////////////////////////////////////////
|
||||||
|
void TStochEm::execute(void)
|
||||||
|
{
|
||||||
|
PhotonR photon(par().gauge, par().zmScheme);
|
||||||
|
EmField &a = *env().createLattice<EmField>(getName());
|
||||||
|
EmComp *w;
|
||||||
|
|
||||||
|
if (!env().hasCreatedObject("_" + getName() + "_weight"))
|
||||||
|
{
|
||||||
|
LOG(Message) << "Caching stochatic EM potential weight (gauge: "
|
||||||
|
<< par().gauge << ", zero-mode scheme: "
|
||||||
|
<< par().zmScheme << ")..." << std::endl;
|
||||||
|
w = env().createLattice<EmComp>("_" + getName() + "_weight");
|
||||||
|
photon.StochasticWeight(*w);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
w = env().getObject<EmComp>("_" + getName() + "_weight");
|
||||||
|
}
|
||||||
|
LOG(Message) << "Generating stochatic EM potential..." << std::endl;
|
||||||
|
photon.StochasticField(a, *env().get4dRng(), *w);
|
||||||
|
}
|
75
extras/Hadrons/Modules/MGauge/StochEm.hpp
Normal file
75
extras/Hadrons/Modules/MGauge/StochEm.hpp
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
Copyright (C) 2016
|
||||||
|
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef Hadrons_MGauge_StochEm_hpp_
|
||||||
|
#define Hadrons_MGauge_StochEm_hpp_
|
||||||
|
|
||||||
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
|
#include <Grid/Hadrons/ModuleFactory.hpp>
|
||||||
|
|
||||||
|
BEGIN_HADRONS_NAMESPACE
|
||||||
|
|
||||||
|
/******************************************************************************
|
||||||
|
* StochEm *
|
||||||
|
******************************************************************************/
|
||||||
|
BEGIN_MODULE_NAMESPACE(MGauge)
|
||||||
|
|
||||||
|
class StochEmPar: Serializable
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
|
||||||
|
PhotonR::Gauge, gauge,
|
||||||
|
PhotonR::ZmScheme, zmScheme);
|
||||||
|
};
|
||||||
|
|
||||||
|
class TStochEm: public Module<StochEmPar>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef PhotonR::GaugeField EmField;
|
||||||
|
typedef PhotonR::GaugeLinkField EmComp;
|
||||||
|
public:
|
||||||
|
// constructor
|
||||||
|
TStochEm(const std::string name);
|
||||||
|
// destructor
|
||||||
|
virtual ~TStochEm(void) = default;
|
||||||
|
// dependency relation
|
||||||
|
virtual std::vector<std::string> getInput(void);
|
||||||
|
virtual std::vector<std::string> getOutput(void);
|
||||||
|
// setup
|
||||||
|
virtual void setup(void);
|
||||||
|
// execution
|
||||||
|
virtual void execute(void);
|
||||||
|
};
|
||||||
|
|
||||||
|
MODULE_REGISTER_NS(StochEm, TStochEm, MGauge);
|
||||||
|
|
||||||
|
END_MODULE_NAMESPACE
|
||||||
|
|
||||||
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
|
#endif // Hadrons_MGauge_StochEm_hpp_
|
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_Unit_hpp_
|
#ifndef Hadrons_MGauge_Unit_hpp_
|
||||||
#define Hadrons_Unit_hpp_
|
#define Hadrons_MGauge_Unit_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Unit_hpp_
|
#endif // Hadrons_MGauge_Unit_hpp_
|
||||||
|
@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_NoiseLoop_hpp_
|
#ifndef Hadrons_MLoop_NoiseLoop_hpp_
|
||||||
#define Hadrons_NoiseLoop_hpp_
|
#define Hadrons_MLoop_NoiseLoop_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -65,7 +65,7 @@ template <typename FImpl>
|
|||||||
class TNoiseLoop: public Module<NoiseLoopPar>
|
class TNoiseLoop: public Module<NoiseLoopPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl,);
|
FERM_TYPE_ALIASES(FImpl,);
|
||||||
public:
|
public:
|
||||||
// constructor
|
// constructor
|
||||||
TNoiseLoop(const std::string name);
|
TNoiseLoop(const std::string name);
|
||||||
@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_NoiseLoop_hpp_
|
#endif // Hadrons_MLoop_NoiseLoop_hpp_
|
||||||
|
226
extras/Hadrons/Modules/MScalar/ChargedProp.cc
Normal file
226
extras/Hadrons/Modules/MScalar/ChargedProp.cc
Normal file
@ -0,0 +1,226 @@
|
|||||||
|
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
|
||||||
|
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Hadrons;
|
||||||
|
using namespace MScalar;
|
||||||
|
|
||||||
|
/******************************************************************************
|
||||||
|
* TChargedProp implementation *
|
||||||
|
******************************************************************************/
|
||||||
|
// constructor /////////////////////////////////////////////////////////////////
|
||||||
|
TChargedProp::TChargedProp(const std::string name)
|
||||||
|
: Module<ChargedPropPar>(name)
|
||||||
|
{}
|
||||||
|
|
||||||
|
// dependencies/products ///////////////////////////////////////////////////////
|
||||||
|
std::vector<std::string> TChargedProp::getInput(void)
|
||||||
|
{
|
||||||
|
std::vector<std::string> in = {par().source, par().emField};
|
||||||
|
|
||||||
|
return in;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> TChargedProp::getOutput(void)
|
||||||
|
{
|
||||||
|
std::vector<std::string> out = {getName()};
|
||||||
|
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// setup ///////////////////////////////////////////////////////////////////////
|
||||||
|
void TChargedProp::setup(void)
|
||||||
|
{
|
||||||
|
freeMomPropName_ = FREEMOMPROP(par().mass);
|
||||||
|
phaseName_.clear();
|
||||||
|
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
|
||||||
|
{
|
||||||
|
phaseName_.push_back("_shiftphase_" + std::to_string(mu));
|
||||||
|
}
|
||||||
|
GFSrcName_ = "_" + getName() + "_DinvSrc";
|
||||||
|
if (!env().hasRegisteredObject(freeMomPropName_))
|
||||||
|
{
|
||||||
|
env().registerLattice<ScalarField>(freeMomPropName_);
|
||||||
|
}
|
||||||
|
if (!env().hasRegisteredObject(phaseName_[0]))
|
||||||
|
{
|
||||||
|
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
|
||||||
|
{
|
||||||
|
env().registerLattice<ScalarField>(phaseName_[mu]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!env().hasRegisteredObject(GFSrcName_))
|
||||||
|
{
|
||||||
|
env().registerLattice<ScalarField>(GFSrcName_);
|
||||||
|
}
|
||||||
|
env().registerLattice<ScalarField>(getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
// execution ///////////////////////////////////////////////////////////////////
|
||||||
|
void TChargedProp::execute(void)
|
||||||
|
{
|
||||||
|
// CACHING ANALYTIC EXPRESSIONS
|
||||||
|
ScalarField &source = *env().getObject<ScalarField>(par().source);
|
||||||
|
Complex ci(0.0,1.0);
|
||||||
|
FFT fft(env().getGrid());
|
||||||
|
|
||||||
|
// cache free scalar propagator
|
||||||
|
if (!env().hasCreatedObject(freeMomPropName_))
|
||||||
|
{
|
||||||
|
LOG(Message) << "Caching momentum space free scalar propagator"
|
||||||
|
<< " (mass= " << par().mass << ")..." << std::endl;
|
||||||
|
freeMomProp_ = env().createLattice<ScalarField>(freeMomPropName_);
|
||||||
|
SIMPL::MomentumSpacePropagator(*freeMomProp_, par().mass);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_);
|
||||||
|
}
|
||||||
|
// cache G*F*src
|
||||||
|
if (!env().hasCreatedObject(GFSrcName_))
|
||||||
|
|
||||||
|
{
|
||||||
|
GFSrc_ = env().createLattice<ScalarField>(GFSrcName_);
|
||||||
|
fft.FFT_all_dim(*GFSrc_, source, FFT::forward);
|
||||||
|
*GFSrc_ = (*freeMomProp_)*(*GFSrc_);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
|
||||||
|
}
|
||||||
|
// cache phases
|
||||||
|
if (!env().hasCreatedObject(phaseName_[0]))
|
||||||
|
{
|
||||||
|
std::vector<int> &l = env().getGrid()->_fdimensions;
|
||||||
|
|
||||||
|
LOG(Message) << "Caching shift phases..." << std::endl;
|
||||||
|
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
|
||||||
|
{
|
||||||
|
Real twoPiL = M_PI*2./l[mu];
|
||||||
|
|
||||||
|
phase_.push_back(env().createLattice<ScalarField>(phaseName_[mu]));
|
||||||
|
LatticeCoordinate(*(phase_[mu]), mu);
|
||||||
|
*(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu])));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
|
||||||
|
{
|
||||||
|
phase_.push_back(env().getObject<ScalarField>(phaseName_[mu]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// PROPAGATOR CALCULATION
|
||||||
|
LOG(Message) << "Computing charged scalar propagator"
|
||||||
|
<< " (mass= " << par().mass
|
||||||
|
<< ", charge= " << par().charge << ")..." << std::endl;
|
||||||
|
|
||||||
|
ScalarField &prop = *env().createLattice<ScalarField>(getName());
|
||||||
|
ScalarField buf(env().getGrid());
|
||||||
|
ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_;
|
||||||
|
double q = par().charge;
|
||||||
|
|
||||||
|
// G*F*Src
|
||||||
|
prop = GFSrc;
|
||||||
|
|
||||||
|
// - q*G*momD1*G*F*Src (momD1 = F*D1*Finv)
|
||||||
|
buf = GFSrc;
|
||||||
|
momD1(buf, fft);
|
||||||
|
buf = G*buf;
|
||||||
|
prop = prop - q*buf;
|
||||||
|
|
||||||
|
// + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
|
||||||
|
momD1(buf, fft);
|
||||||
|
prop = prop + q*q*G*buf;
|
||||||
|
|
||||||
|
// - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv)
|
||||||
|
buf = GFSrc;
|
||||||
|
momD2(buf, fft);
|
||||||
|
prop = prop - q*q*G*buf;
|
||||||
|
|
||||||
|
// final FT
|
||||||
|
fft.FFT_all_dim(prop, prop, FFT::backward);
|
||||||
|
|
||||||
|
// OUTPUT IF NECESSARY
|
||||||
|
if (!par().output.empty())
|
||||||
|
{
|
||||||
|
std::string filename = par().output + "." +
|
||||||
|
std::to_string(env().getTrajectory());
|
||||||
|
|
||||||
|
LOG(Message) << "Saving zero-momentum projection to '"
|
||||||
|
<< filename << "'..." << std::endl;
|
||||||
|
|
||||||
|
CorrWriter writer(filename);
|
||||||
|
std::vector<TComplex> vecBuf;
|
||||||
|
std::vector<Complex> result;
|
||||||
|
|
||||||
|
sliceSum(prop, vecBuf, Tp);
|
||||||
|
result.resize(vecBuf.size());
|
||||||
|
for (unsigned int t = 0; t < vecBuf.size(); ++t)
|
||||||
|
{
|
||||||
|
result[t] = TensorRemove(vecBuf[t]);
|
||||||
|
}
|
||||||
|
write(writer, "charge", q);
|
||||||
|
write(writer, "prop", result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void TChargedProp::momD1(ScalarField &s, FFT &fft)
|
||||||
|
{
|
||||||
|
EmField &A = *env().getObject<EmField>(par().emField);
|
||||||
|
ScalarField buf(env().getGrid()), result(env().getGrid()),
|
||||||
|
Amu(env().getGrid());
|
||||||
|
Complex ci(0.0,1.0);
|
||||||
|
|
||||||
|
result = zero;
|
||||||
|
|
||||||
|
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
|
||||||
|
{
|
||||||
|
Amu = peekLorentz(A, mu);
|
||||||
|
buf = (*phase_[mu])*s;
|
||||||
|
fft.FFT_all_dim(buf, buf, FFT::backward);
|
||||||
|
buf = Amu*buf;
|
||||||
|
fft.FFT_all_dim(buf, buf, FFT::forward);
|
||||||
|
result = result - ci*buf;
|
||||||
|
}
|
||||||
|
fft.FFT_all_dim(s, s, FFT::backward);
|
||||||
|
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
|
||||||
|
{
|
||||||
|
Amu = peekLorentz(A, mu);
|
||||||
|
buf = Amu*s;
|
||||||
|
fft.FFT_all_dim(buf, buf, FFT::forward);
|
||||||
|
result = result + ci*adj(*phase_[mu])*buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
s = result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void TChargedProp::momD2(ScalarField &s, FFT &fft)
|
||||||
|
{
|
||||||
|
EmField &A = *env().getObject<EmField>(par().emField);
|
||||||
|
ScalarField buf(env().getGrid()), result(env().getGrid()),
|
||||||
|
Amu(env().getGrid());
|
||||||
|
|
||||||
|
result = zero;
|
||||||
|
|
||||||
|
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
|
||||||
|
{
|
||||||
|
Amu = peekLorentz(A, mu);
|
||||||
|
buf = (*phase_[mu])*s;
|
||||||
|
fft.FFT_all_dim(buf, buf, FFT::backward);
|
||||||
|
buf = Amu*Amu*buf;
|
||||||
|
fft.FFT_all_dim(buf, buf, FFT::forward);
|
||||||
|
result = result + .5*buf;
|
||||||
|
}
|
||||||
|
fft.FFT_all_dim(s, s, FFT::backward);
|
||||||
|
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
|
||||||
|
{
|
||||||
|
Amu = peekLorentz(A, mu);
|
||||||
|
buf = Amu*Amu*s;
|
||||||
|
fft.FFT_all_dim(buf, buf, FFT::forward);
|
||||||
|
result = result + .5*adj(*phase_[mu])*buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
s = result;
|
||||||
|
}
|
61
extras/Hadrons/Modules/MScalar/ChargedProp.hpp
Normal file
61
extras/Hadrons/Modules/MScalar/ChargedProp.hpp
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#ifndef Hadrons_MScalar_ChargedProp_hpp_
|
||||||
|
#define Hadrons_MScalar_ChargedProp_hpp_
|
||||||
|
|
||||||
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
|
#include <Grid/Hadrons/ModuleFactory.hpp>
|
||||||
|
|
||||||
|
BEGIN_HADRONS_NAMESPACE
|
||||||
|
|
||||||
|
/******************************************************************************
|
||||||
|
* Charged scalar propagator *
|
||||||
|
******************************************************************************/
|
||||||
|
BEGIN_MODULE_NAMESPACE(MScalar)
|
||||||
|
|
||||||
|
class ChargedPropPar: Serializable
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar,
|
||||||
|
std::string, emField,
|
||||||
|
std::string, source,
|
||||||
|
double, mass,
|
||||||
|
double, charge,
|
||||||
|
std::string, output);
|
||||||
|
};
|
||||||
|
|
||||||
|
class TChargedProp: public Module<ChargedPropPar>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
SCALAR_TYPE_ALIASES(SIMPL,);
|
||||||
|
typedef PhotonR::GaugeField EmField;
|
||||||
|
typedef PhotonR::GaugeLinkField EmComp;
|
||||||
|
public:
|
||||||
|
// constructor
|
||||||
|
TChargedProp(const std::string name);
|
||||||
|
// destructor
|
||||||
|
virtual ~TChargedProp(void) = default;
|
||||||
|
// dependency relation
|
||||||
|
virtual std::vector<std::string> getInput(void);
|
||||||
|
virtual std::vector<std::string> getOutput(void);
|
||||||
|
// setup
|
||||||
|
virtual void setup(void);
|
||||||
|
// execution
|
||||||
|
virtual void execute(void);
|
||||||
|
private:
|
||||||
|
void momD1(ScalarField &s, FFT &fft);
|
||||||
|
void momD2(ScalarField &s, FFT &fft);
|
||||||
|
private:
|
||||||
|
std::string freeMomPropName_, GFSrcName_;
|
||||||
|
std::vector<std::string> phaseName_;
|
||||||
|
ScalarField *freeMomProp_, *GFSrc_;
|
||||||
|
std::vector<ScalarField *> phase_;
|
||||||
|
EmField *A;
|
||||||
|
};
|
||||||
|
|
||||||
|
MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);
|
||||||
|
|
||||||
|
END_MODULE_NAMESPACE
|
||||||
|
|
||||||
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
|
#endif // Hadrons_MScalar_ChargedProp_hpp_
|
79
extras/Hadrons/Modules/MScalar/FreeProp.cc
Normal file
79
extras/Hadrons/Modules/MScalar/FreeProp.cc
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
|
||||||
|
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Hadrons;
|
||||||
|
using namespace MScalar;
|
||||||
|
|
||||||
|
/******************************************************************************
|
||||||
|
* TFreeProp implementation *
|
||||||
|
******************************************************************************/
|
||||||
|
// constructor /////////////////////////////////////////////////////////////////
|
||||||
|
TFreeProp::TFreeProp(const std::string name)
|
||||||
|
: Module<FreePropPar>(name)
|
||||||
|
{}
|
||||||
|
|
||||||
|
// dependencies/products ///////////////////////////////////////////////////////
|
||||||
|
std::vector<std::string> TFreeProp::getInput(void)
|
||||||
|
{
|
||||||
|
std::vector<std::string> in = {par().source};
|
||||||
|
|
||||||
|
return in;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> TFreeProp::getOutput(void)
|
||||||
|
{
|
||||||
|
std::vector<std::string> out = {getName()};
|
||||||
|
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// setup ///////////////////////////////////////////////////////////////////////
|
||||||
|
void TFreeProp::setup(void)
|
||||||
|
{
|
||||||
|
freeMomPropName_ = FREEMOMPROP(par().mass);
|
||||||
|
|
||||||
|
if (!env().hasRegisteredObject(freeMomPropName_))
|
||||||
|
{
|
||||||
|
env().registerLattice<ScalarField>(freeMomPropName_);
|
||||||
|
}
|
||||||
|
env().registerLattice<ScalarField>(getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
// execution ///////////////////////////////////////////////////////////////////
|
||||||
|
void TFreeProp::execute(void)
|
||||||
|
{
|
||||||
|
ScalarField &prop = *env().createLattice<ScalarField>(getName());
|
||||||
|
ScalarField &source = *env().getObject<ScalarField>(par().source);
|
||||||
|
ScalarField *freeMomProp;
|
||||||
|
|
||||||
|
if (!env().hasCreatedObject(freeMomPropName_))
|
||||||
|
{
|
||||||
|
LOG(Message) << "Caching momentum space free scalar propagator"
|
||||||
|
<< " (mass= " << par().mass << ")..." << std::endl;
|
||||||
|
freeMomProp = env().createLattice<ScalarField>(freeMomPropName_);
|
||||||
|
SIMPL::MomentumSpacePropagator(*freeMomProp, par().mass);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
freeMomProp = env().getObject<ScalarField>(freeMomPropName_);
|
||||||
|
}
|
||||||
|
LOG(Message) << "Computing free scalar propagator..." << std::endl;
|
||||||
|
SIMPL::FreePropagator(source, prop, *freeMomProp);
|
||||||
|
|
||||||
|
if (!par().output.empty())
|
||||||
|
{
|
||||||
|
TextWriter writer(par().output + "." +
|
||||||
|
std::to_string(env().getTrajectory()));
|
||||||
|
std::vector<TComplex> buf;
|
||||||
|
std::vector<Complex> result;
|
||||||
|
|
||||||
|
sliceSum(prop, buf, Tp);
|
||||||
|
result.resize(buf.size());
|
||||||
|
for (unsigned int t = 0; t < buf.size(); ++t)
|
||||||
|
{
|
||||||
|
result[t] = TensorRemove(buf[t]);
|
||||||
|
}
|
||||||
|
write(writer, "prop", result);
|
||||||
|
}
|
||||||
|
}
|
50
extras/Hadrons/Modules/MScalar/FreeProp.hpp
Normal file
50
extras/Hadrons/Modules/MScalar/FreeProp.hpp
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
#ifndef Hadrons_MScalar_FreeProp_hpp_
|
||||||
|
#define Hadrons_MScalar_FreeProp_hpp_
|
||||||
|
|
||||||
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
|
#include <Grid/Hadrons/ModuleFactory.hpp>
|
||||||
|
|
||||||
|
BEGIN_HADRONS_NAMESPACE
|
||||||
|
|
||||||
|
/******************************************************************************
|
||||||
|
* FreeProp *
|
||||||
|
******************************************************************************/
|
||||||
|
BEGIN_MODULE_NAMESPACE(MScalar)
|
||||||
|
|
||||||
|
class FreePropPar: Serializable
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
|
||||||
|
std::string, source,
|
||||||
|
double, mass,
|
||||||
|
std::string, output);
|
||||||
|
};
|
||||||
|
|
||||||
|
class TFreeProp: public Module<FreePropPar>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
SCALAR_TYPE_ALIASES(SIMPL,);
|
||||||
|
public:
|
||||||
|
// constructor
|
||||||
|
TFreeProp(const std::string name);
|
||||||
|
// destructor
|
||||||
|
virtual ~TFreeProp(void) = default;
|
||||||
|
// dependency relation
|
||||||
|
virtual std::vector<std::string> getInput(void);
|
||||||
|
virtual std::vector<std::string> getOutput(void);
|
||||||
|
// setup
|
||||||
|
virtual void setup(void);
|
||||||
|
// execution
|
||||||
|
virtual void execute(void);
|
||||||
|
private:
|
||||||
|
std::string freeMomPropName_;
|
||||||
|
};
|
||||||
|
|
||||||
|
MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar);
|
||||||
|
|
||||||
|
END_MODULE_NAMESPACE
|
||||||
|
|
||||||
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
|
#endif // Hadrons_MScalar_FreeProp_hpp_
|
6
extras/Hadrons/Modules/MScalar/Scalar.hpp
Normal file
6
extras/Hadrons/Modules/MScalar/Scalar.hpp
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
#ifndef Hadrons_Scalar_hpp_
|
||||||
|
#define Hadrons_Scalar_hpp_
|
||||||
|
|
||||||
|
#define FREEMOMPROP(m) "_scalar_mom_prop_" + std::to_string(m)
|
||||||
|
|
||||||
|
#endif // Hadrons_Scalar_hpp_
|
114
extras/Hadrons/Modules/MSink/Point.hpp
Normal file
114
extras/Hadrons/Modules/MSink/Point.hpp
Normal file
@ -0,0 +1,114 @@
|
|||||||
|
#ifndef Hadrons_MSink_Point_hpp_
|
||||||
|
#define Hadrons_MSink_Point_hpp_
|
||||||
|
|
||||||
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
|
#include <Grid/Hadrons/ModuleFactory.hpp>
|
||||||
|
|
||||||
|
BEGIN_HADRONS_NAMESPACE
|
||||||
|
|
||||||
|
/******************************************************************************
|
||||||
|
* Point *
|
||||||
|
******************************************************************************/
|
||||||
|
BEGIN_MODULE_NAMESPACE(MSink)
|
||||||
|
|
||||||
|
class PointPar: Serializable
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
GRID_SERIALIZABLE_CLASS_MEMBERS(PointPar,
|
||||||
|
std::string, mom);
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename FImpl>
|
||||||
|
class TPoint: public Module<PointPar>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
FERM_TYPE_ALIASES(FImpl,);
|
||||||
|
SINK_TYPE_ALIASES();
|
||||||
|
public:
|
||||||
|
// constructor
|
||||||
|
TPoint(const std::string name);
|
||||||
|
// destructor
|
||||||
|
virtual ~TPoint(void) = default;
|
||||||
|
// dependency relation
|
||||||
|
virtual std::vector<std::string> getInput(void);
|
||||||
|
virtual std::vector<std::string> getOutput(void);
|
||||||
|
// setup
|
||||||
|
virtual void setup(void);
|
||||||
|
// execution
|
||||||
|
virtual void execute(void);
|
||||||
|
};
|
||||||
|
|
||||||
|
MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSink);
|
||||||
|
MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSink);
|
||||||
|
|
||||||
|
/******************************************************************************
|
||||||
|
* TPoint implementation *
|
||||||
|
******************************************************************************/
|
||||||
|
// constructor /////////////////////////////////////////////////////////////////
|
||||||
|
template <typename FImpl>
|
||||||
|
TPoint<FImpl>::TPoint(const std::string name)
|
||||||
|
: Module<PointPar>(name)
|
||||||
|
{}
|
||||||
|
|
||||||
|
// dependencies/products ///////////////////////////////////////////////////////
|
||||||
|
template <typename FImpl>
|
||||||
|
std::vector<std::string> TPoint<FImpl>::getInput(void)
|
||||||
|
{
|
||||||
|
std::vector<std::string> in;
|
||||||
|
|
||||||
|
return in;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename FImpl>
|
||||||
|
std::vector<std::string> TPoint<FImpl>::getOutput(void)
|
||||||
|
{
|
||||||
|
std::vector<std::string> out = {getName()};
|
||||||
|
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// setup ///////////////////////////////////////////////////////////////////////
|
||||||
|
template <typename FImpl>
|
||||||
|
void TPoint<FImpl>::setup(void)
|
||||||
|
{
|
||||||
|
unsigned int size;
|
||||||
|
|
||||||
|
size = env().template lattice4dSize<LatticeComplex>();
|
||||||
|
env().registerObject(getName(), size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// execution ///////////////////////////////////////////////////////////////////
|
||||||
|
template <typename FImpl>
|
||||||
|
void TPoint<FImpl>::execute(void)
|
||||||
|
{
|
||||||
|
std::vector<Real> p = strToVec<Real>(par().mom);
|
||||||
|
LatticeComplex ph(env().getGrid()), coor(env().getGrid());
|
||||||
|
Complex i(0.0,1.0);
|
||||||
|
|
||||||
|
LOG(Message) << "Setting up point sink function for momentum ["
|
||||||
|
<< par().mom << "]" << std::endl;
|
||||||
|
ph = zero;
|
||||||
|
for(unsigned int mu = 0; mu < env().getNd(); mu++)
|
||||||
|
{
|
||||||
|
LatticeCoordinate(coor, mu);
|
||||||
|
ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
|
||||||
|
}
|
||||||
|
ph = exp((Real)(2*M_PI)*i*ph);
|
||||||
|
auto sink = [ph](const PropagatorField &field)
|
||||||
|
{
|
||||||
|
SlicedPropagator res;
|
||||||
|
PropagatorField tmp = ph*field;
|
||||||
|
|
||||||
|
sliceSum(tmp, res, Tp);
|
||||||
|
|
||||||
|
return res;
|
||||||
|
};
|
||||||
|
env().setObject(getName(), new SinkFn(sink));
|
||||||
|
}
|
||||||
|
|
||||||
|
END_MODULE_NAMESPACE
|
||||||
|
|
||||||
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
|
#endif // Hadrons_MSink_Point_hpp_
|
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_RBPrecCG_hpp_
|
#ifndef Hadrons_MSolver_RBPrecCG_hpp_
|
||||||
#define Hadrons_RBPrecCG_hpp_
|
#define Hadrons_MSolver_RBPrecCG_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -53,7 +53,7 @@ template <typename FImpl>
|
|||||||
class TRBPrecCG: public Module<RBPrecCGPar>
|
class TRBPrecCG: public Module<RBPrecCGPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl,);
|
FGS_TYPE_ALIASES(FImpl,);
|
||||||
public:
|
public:
|
||||||
// constructor
|
// constructor
|
||||||
TRBPrecCG(const std::string name);
|
TRBPrecCG(const std::string name);
|
||||||
@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_RBPrecCG_hpp_
|
#endif // Hadrons_MSolver_RBPrecCG_hpp_
|
||||||
|
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_Point_hpp_
|
#ifndef Hadrons_MSource_Point_hpp_
|
||||||
#define Hadrons_Point_hpp_
|
#define Hadrons_MSource_Point_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -63,7 +63,7 @@ template <typename FImpl>
|
|||||||
class TPoint: public Module<PointPar>
|
class TPoint: public Module<PointPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl,);
|
FERM_TYPE_ALIASES(FImpl,);
|
||||||
public:
|
public:
|
||||||
// constructor
|
// constructor
|
||||||
TPoint(const std::string name);
|
TPoint(const std::string name);
|
||||||
@ -78,7 +78,8 @@ public:
|
|||||||
virtual void execute(void);
|
virtual void execute(void);
|
||||||
};
|
};
|
||||||
|
|
||||||
MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSource);
|
MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSource);
|
||||||
|
MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSource);
|
||||||
|
|
||||||
/******************************************************************************
|
/******************************************************************************
|
||||||
* TPoint template implementation *
|
* TPoint template implementation *
|
||||||
@ -132,4 +133,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Point_hpp_
|
#endif // Hadrons_MSource_Point_hpp_
|
||||||
|
@ -28,8 +28,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_SeqGamma_hpp_
|
#ifndef Hadrons_MSource_SeqGamma_hpp_
|
||||||
#define Hadrons_SeqGamma_hpp_
|
#define Hadrons_MSource_SeqGamma_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -72,7 +72,7 @@ template <typename FImpl>
|
|||||||
class TSeqGamma: public Module<SeqGammaPar>
|
class TSeqGamma: public Module<SeqGammaPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl,);
|
FGS_TYPE_ALIASES(FImpl,);
|
||||||
public:
|
public:
|
||||||
// constructor
|
// constructor
|
||||||
TSeqGamma(const std::string name);
|
TSeqGamma(const std::string name);
|
||||||
@ -161,4 +161,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_SeqGamma_hpp_
|
#endif // Hadrons_MSource_SeqGamma_hpp_
|
||||||
|
@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_WallSource_hpp_
|
#ifndef Hadrons_MSource_WallSource_hpp_
|
||||||
#define Hadrons_WallSource_hpp_
|
#define Hadrons_MSource_WallSource_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -64,7 +64,7 @@ template <typename FImpl>
|
|||||||
class TWall: public Module<WallPar>
|
class TWall: public Module<WallPar>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl,);
|
FERM_TYPE_ALIASES(FImpl,);
|
||||||
public:
|
public:
|
||||||
// constructor
|
// constructor
|
||||||
TWall(const std::string name);
|
TWall(const std::string name);
|
||||||
@ -144,4 +144,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_WallSource_hpp_
|
#endif // Hadrons_MSource_WallSource_hpp_
|
||||||
|
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef Hadrons_Z2_hpp_
|
#ifndef Hadrons_MSource_Z2_hpp_
|
||||||
#define Hadrons_Z2_hpp_
|
#define Hadrons_MSource_Z2_hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -67,7 +67,7 @@ template <typename FImpl>
|
|||||||
class TZ2: public Module<Z2Par>
|
class TZ2: public Module<Z2Par>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
TYPE_ALIASES(FImpl,);
|
FERM_TYPE_ALIASES(FImpl,);
|
||||||
public:
|
public:
|
||||||
// constructor
|
// constructor
|
||||||
TZ2(const std::string name);
|
TZ2(const std::string name);
|
||||||
@ -82,7 +82,8 @@ public:
|
|||||||
virtual void execute(void);
|
virtual void execute(void);
|
||||||
};
|
};
|
||||||
|
|
||||||
MODULE_REGISTER_NS(Z2, TZ2<FIMPL>, MSource);
|
MODULE_REGISTER_NS(Z2, TZ2<FIMPL>, MSource);
|
||||||
|
MODULE_REGISTER_NS(ScalarZ2, TZ2<ScalarImplCR>, MSource);
|
||||||
|
|
||||||
/******************************************************************************
|
/******************************************************************************
|
||||||
* TZ2 template implementation *
|
* TZ2 template implementation *
|
||||||
@ -148,4 +149,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons_Z2_hpp_
|
#endif // Hadrons_MSource_Z2_hpp_
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#ifndef Hadrons____FILEBASENAME____hpp_
|
#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
|
||||||
#define Hadrons____FILEBASENAME____hpp_
|
#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -41,4 +41,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons____FILEBASENAME____hpp_
|
#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#ifndef Hadrons____FILEBASENAME____hpp_
|
#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
|
||||||
#define Hadrons____FILEBASENAME____hpp_
|
#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@ -82,4 +82,4 @@ END_MODULE_NAMESPACE
|
|||||||
|
|
||||||
END_HADRONS_NAMESPACE
|
END_HADRONS_NAMESPACE
|
||||||
|
|
||||||
#endif // Hadrons____FILEBASENAME____hpp_
|
#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
|
||||||
|
@ -4,7 +4,10 @@ modules_cc =\
|
|||||||
Modules/MContraction/WeakNeutral4ptDisc.cc \
|
Modules/MContraction/WeakNeutral4ptDisc.cc \
|
||||||
Modules/MGauge/Load.cc \
|
Modules/MGauge/Load.cc \
|
||||||
Modules/MGauge/Random.cc \
|
Modules/MGauge/Random.cc \
|
||||||
Modules/MGauge/Unit.cc
|
Modules/MGauge/StochEm.cc \
|
||||||
|
Modules/MGauge/Unit.cc \
|
||||||
|
Modules/MScalar/ChargedProp.cc \
|
||||||
|
Modules/MScalar/FreeProp.cc
|
||||||
|
|
||||||
modules_hpp =\
|
modules_hpp =\
|
||||||
Modules/MAction/DWF.hpp \
|
Modules/MAction/DWF.hpp \
|
||||||
@ -17,14 +20,19 @@ modules_hpp =\
|
|||||||
Modules/MContraction/WeakHamiltonianEye.hpp \
|
Modules/MContraction/WeakHamiltonianEye.hpp \
|
||||||
Modules/MContraction/WeakHamiltonianNonEye.hpp \
|
Modules/MContraction/WeakHamiltonianNonEye.hpp \
|
||||||
Modules/MContraction/WeakNeutral4ptDisc.hpp \
|
Modules/MContraction/WeakNeutral4ptDisc.hpp \
|
||||||
|
Modules/MFermion/GaugeProp.hpp \
|
||||||
Modules/MGauge/Load.hpp \
|
Modules/MGauge/Load.hpp \
|
||||||
Modules/MGauge/Random.hpp \
|
Modules/MGauge/Random.hpp \
|
||||||
|
Modules/MGauge/StochEm.hpp \
|
||||||
Modules/MGauge/Unit.hpp \
|
Modules/MGauge/Unit.hpp \
|
||||||
Modules/MLoop/NoiseLoop.hpp \
|
Modules/MLoop/NoiseLoop.hpp \
|
||||||
|
Modules/MScalar/ChargedProp.hpp \
|
||||||
|
Modules/MScalar/FreeProp.hpp \
|
||||||
|
Modules/MScalar/Scalar.hpp \
|
||||||
|
Modules/MSink/Point.hpp \
|
||||||
Modules/MSolver/RBPrecCG.hpp \
|
Modules/MSolver/RBPrecCG.hpp \
|
||||||
Modules/MSource/Point.hpp \
|
Modules/MSource/Point.hpp \
|
||||||
Modules/MSource/SeqGamma.hpp \
|
Modules/MSource/SeqGamma.hpp \
|
||||||
Modules/MSource/Wall.hpp \
|
Modules/MSource/Wall.hpp \
|
||||||
Modules/MSource/Z2.hpp \
|
Modules/MSource/Z2.hpp
|
||||||
Modules/Quark.hpp
|
|
||||||
|
|
||||||
|
11
extras/qed-fvol/Global.cc
Normal file
11
extras/qed-fvol/Global.cc
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#include <qed-fvol/Global.hpp>
|
||||||
|
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace QCD;
|
||||||
|
using namespace QedFVol;
|
||||||
|
|
||||||
|
QedFVolLogger QedFVol::QedFVolLogError(1,"Error");
|
||||||
|
QedFVolLogger QedFVol::QedFVolLogWarning(1,"Warning");
|
||||||
|
QedFVolLogger QedFVol::QedFVolLogMessage(1,"Message");
|
||||||
|
QedFVolLogger QedFVol::QedFVolLogIterative(1,"Iterative");
|
||||||
|
QedFVolLogger QedFVol::QedFVolLogDebug(1,"Debug");
|
42
extras/qed-fvol/Global.hpp
Normal file
42
extras/qed-fvol/Global.hpp
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
#ifndef QedFVol_Global_hpp_
|
||||||
|
#define QedFVol_Global_hpp_
|
||||||
|
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
#define BEGIN_QEDFVOL_NAMESPACE \
|
||||||
|
namespace Grid {\
|
||||||
|
using namespace QCD;\
|
||||||
|
namespace QedFVol {\
|
||||||
|
using Grid::operator<<;
|
||||||
|
#define END_QEDFVOL_NAMESPACE }}
|
||||||
|
|
||||||
|
/* the 'using Grid::operator<<;' statement prevents a very nasty compilation
|
||||||
|
* error with GCC (clang compiles fine without it).
|
||||||
|
*/
|
||||||
|
|
||||||
|
BEGIN_QEDFVOL_NAMESPACE
|
||||||
|
|
||||||
|
class QedFVolLogger: public Logger
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
QedFVolLogger(int on, std::string nm): Logger("QedFVol", on, nm,
|
||||||
|
GridLogColours, "BLACK"){};
|
||||||
|
};
|
||||||
|
|
||||||
|
#define LOG(channel) std::cout << QedFVolLog##channel
|
||||||
|
#define QEDFVOL_ERROR(msg)\
|
||||||
|
LOG(Error) << msg << " (" << __FUNCTION__ << " at " << __FILE__ << ":"\
|
||||||
|
<< __LINE__ << ")" << std::endl;\
|
||||||
|
abort();
|
||||||
|
|
||||||
|
#define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
|
||||||
|
|
||||||
|
extern QedFVolLogger QedFVolLogError;
|
||||||
|
extern QedFVolLogger QedFVolLogWarning;
|
||||||
|
extern QedFVolLogger QedFVolLogMessage;
|
||||||
|
extern QedFVolLogger QedFVolLogIterative;
|
||||||
|
extern QedFVolLogger QedFVolLogDebug;
|
||||||
|
|
||||||
|
END_QEDFVOL_NAMESPACE
|
||||||
|
|
||||||
|
#endif // QedFVol_Global_hpp_
|
9
extras/qed-fvol/Makefile.am
Normal file
9
extras/qed-fvol/Makefile.am
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
AM_CXXFLAGS += -I$(top_srcdir)/extras
|
||||||
|
|
||||||
|
bin_PROGRAMS = qed-fvol
|
||||||
|
|
||||||
|
qed_fvol_SOURCES = \
|
||||||
|
qed-fvol.cc \
|
||||||
|
Global.cc
|
||||||
|
|
||||||
|
qed_fvol_LDADD = -lGrid
|
265
extras/qed-fvol/WilsonLoops.h
Normal file
265
extras/qed-fvol/WilsonLoops.h
Normal file
@ -0,0 +1,265 @@
|
|||||||
|
#ifndef QEDFVOL_WILSONLOOPS_H
|
||||||
|
#define QEDFVOL_WILSONLOOPS_H
|
||||||
|
|
||||||
|
#include <Global.hpp>
|
||||||
|
|
||||||
|
BEGIN_QEDFVOL_NAMESPACE
|
||||||
|
|
||||||
|
template <class Gimpl> class NewWilsonLoops : public Gimpl {
|
||||||
|
public:
|
||||||
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
|
|
||||||
|
typedef typename Gimpl::GaugeLinkField GaugeMat;
|
||||||
|
typedef typename Gimpl::GaugeField GaugeLorentz;
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// directed plaquette oriented in mu,nu plane
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
|
||||||
|
const int mu, const int nu) {
|
||||||
|
// Annoyingly, must use either scope resolution to find dependent base
|
||||||
|
// class,
|
||||||
|
// or this-> ; there is no "this" in a static method. This forces explicit
|
||||||
|
// Gimpl scope
|
||||||
|
// resolution throughout the usage in this file, and rather defeats the
|
||||||
|
// purpose of deriving
|
||||||
|
// from Gimpl.
|
||||||
|
plaq = Gimpl::CovShiftBackward(
|
||||||
|
U[mu], mu, Gimpl::CovShiftBackward(
|
||||||
|
U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// trace of directed plaquette oriented in mu,nu plane
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static void traceDirPlaquette(LatticeComplex &plaq,
|
||||||
|
const std::vector<GaugeMat> &U, const int mu,
|
||||||
|
const int nu) {
|
||||||
|
GaugeMat sp(U[0]._grid);
|
||||||
|
dirPlaquette(sp, U, mu, nu);
|
||||||
|
plaq = trace(sp);
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// sum over all planes of plaquette
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static void sitePlaquette(LatticeComplex &Plaq,
|
||||||
|
const std::vector<GaugeMat> &U) {
|
||||||
|
LatticeComplex sitePlaq(U[0]._grid);
|
||||||
|
Plaq = zero;
|
||||||
|
for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
|
||||||
|
for (int nu = 0; nu < mu; nu++) {
|
||||||
|
traceDirPlaquette(sitePlaq, U, mu, nu);
|
||||||
|
Plaq = Plaq + sitePlaq;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// sum over all x,y,z,t and over all planes of plaquette
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static Real sumPlaquette(const GaugeLorentz &Umu) {
|
||||||
|
std::vector<GaugeMat> U(4, Umu._grid);
|
||||||
|
|
||||||
|
for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
|
||||||
|
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
||||||
|
}
|
||||||
|
|
||||||
|
LatticeComplex Plaq(Umu._grid);
|
||||||
|
|
||||||
|
sitePlaquette(Plaq, U);
|
||||||
|
|
||||||
|
TComplex Tp = sum(Plaq);
|
||||||
|
Complex p = TensorRemove(Tp);
|
||||||
|
return p.real();
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// average over all x,y,z,t and over all planes of plaquette
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static Real avgPlaquette(const GaugeLorentz &Umu) {
|
||||||
|
int ndim = Umu._grid->_ndimension;
|
||||||
|
Real sumplaq = sumPlaquette(Umu);
|
||||||
|
Real vol = Umu._grid->gSites();
|
||||||
|
Real faces = (1.0 * ndim * (ndim - 1)) / 2.0;
|
||||||
|
return sumplaq / vol / faces / Nc; // Nc dependent... FIXME
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// Wilson loop of size (R1, R2), oriented in mu,nu plane
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
|
||||||
|
const int Rmu, const int Rnu,
|
||||||
|
const int mu, const int nu) {
|
||||||
|
wl = U[nu];
|
||||||
|
|
||||||
|
for(int i = 0; i < Rnu-1; i++){
|
||||||
|
wl = Gimpl::CovShiftForward(U[nu], nu, wl);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i = 0; i < Rmu; i++){
|
||||||
|
wl = Gimpl::CovShiftForward(U[mu], mu, wl);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i = 0; i < Rnu; i++){
|
||||||
|
wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i = 0; i < Rmu; i++){
|
||||||
|
wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// trace of Wilson Loop oriented in mu,nu plane
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static void traceWilsonLoop(LatticeComplex &wl,
|
||||||
|
const std::vector<GaugeMat> &U,
|
||||||
|
const int Rmu, const int Rnu,
|
||||||
|
const int mu, const int nu) {
|
||||||
|
GaugeMat sp(U[0]._grid);
|
||||||
|
wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
|
||||||
|
wl = trace(sp);
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// sum over all planes of Wilson loop
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static void siteWilsonLoop(LatticeComplex &Wl,
|
||||||
|
const std::vector<GaugeMat> &U,
|
||||||
|
const int R1, const int R2) {
|
||||||
|
LatticeComplex siteWl(U[0]._grid);
|
||||||
|
Wl = zero;
|
||||||
|
for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
|
||||||
|
for (int nu = 0; nu < mu; nu++) {
|
||||||
|
traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
|
||||||
|
Wl = Wl + siteWl;
|
||||||
|
traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
|
||||||
|
Wl = Wl + siteWl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// sum over planes of Wilson loop with length R1
|
||||||
|
// in the time direction
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
|
||||||
|
const std::vector<GaugeMat> &U,
|
||||||
|
const int R1, const int R2) {
|
||||||
|
LatticeComplex siteWl(U[0]._grid);
|
||||||
|
|
||||||
|
int ndim = U[0]._grid->_ndimension;
|
||||||
|
|
||||||
|
Wl = zero;
|
||||||
|
for (int nu = 0; nu < ndim - 1; nu++) {
|
||||||
|
traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
|
||||||
|
Wl = Wl + siteWl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// sum Wilson loop over all planes orthogonal to the time direction
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static void siteSpatialWilsonLoop(LatticeComplex &Wl,
|
||||||
|
const std::vector<GaugeMat> &U,
|
||||||
|
const int R1, const int R2) {
|
||||||
|
LatticeComplex siteWl(U[0]._grid);
|
||||||
|
|
||||||
|
Wl = zero;
|
||||||
|
for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) {
|
||||||
|
for (int nu = 0; nu < mu; nu++) {
|
||||||
|
traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
|
||||||
|
Wl = Wl + siteWl;
|
||||||
|
traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
|
||||||
|
Wl = Wl + siteWl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// sum over all x,y,z,t and over all planes of Wilson loop
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static Real sumWilsonLoop(const GaugeLorentz &Umu,
|
||||||
|
const int R1, const int R2) {
|
||||||
|
std::vector<GaugeMat> U(4, Umu._grid);
|
||||||
|
|
||||||
|
for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
|
||||||
|
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
||||||
|
}
|
||||||
|
|
||||||
|
LatticeComplex Wl(Umu._grid);
|
||||||
|
|
||||||
|
siteWilsonLoop(Wl, U, R1, R2);
|
||||||
|
|
||||||
|
TComplex Tp = sum(Wl);
|
||||||
|
Complex p = TensorRemove(Tp);
|
||||||
|
return p.real();
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// sum over all x,y,z,t and over all planes of timelike Wilson loop
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
|
||||||
|
const int R1, const int R2) {
|
||||||
|
std::vector<GaugeMat> U(4, Umu._grid);
|
||||||
|
|
||||||
|
for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
|
||||||
|
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
||||||
|
}
|
||||||
|
|
||||||
|
LatticeComplex Wl(Umu._grid);
|
||||||
|
|
||||||
|
siteTimelikeWilsonLoop(Wl, U, R1, R2);
|
||||||
|
|
||||||
|
TComplex Tp = sum(Wl);
|
||||||
|
Complex p = TensorRemove(Tp);
|
||||||
|
return p.real();
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// sum over all x,y,z,t and over all planes of spatial Wilson loop
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
|
||||||
|
const int R1, const int R2) {
|
||||||
|
std::vector<GaugeMat> U(4, Umu._grid);
|
||||||
|
|
||||||
|
for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
|
||||||
|
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
||||||
|
}
|
||||||
|
|
||||||
|
LatticeComplex Wl(Umu._grid);
|
||||||
|
|
||||||
|
siteSpatialWilsonLoop(Wl, U, R1, R2);
|
||||||
|
|
||||||
|
TComplex Tp = sum(Wl);
|
||||||
|
Complex p = TensorRemove(Tp);
|
||||||
|
return p.real();
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// average over all x,y,z,t and over all planes of Wilson loop
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static Real avgWilsonLoop(const GaugeLorentz &Umu,
|
||||||
|
const int R1, const int R2) {
|
||||||
|
int ndim = Umu._grid->_ndimension;
|
||||||
|
Real sumWl = sumWilsonLoop(Umu, R1, R2);
|
||||||
|
Real vol = Umu._grid->gSites();
|
||||||
|
Real faces = 1.0 * ndim * (ndim - 1);
|
||||||
|
return sumWl / vol / faces / Nc; // Nc dependent... FIXME
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// average over all x,y,z,t and over all planes of timelike Wilson loop
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
|
||||||
|
const int R1, const int R2) {
|
||||||
|
int ndim = Umu._grid->_ndimension;
|
||||||
|
Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
|
||||||
|
Real vol = Umu._grid->gSites();
|
||||||
|
Real faces = 1.0 * (ndim - 1);
|
||||||
|
return sumWl / vol / faces / Nc; // Nc dependent... FIXME
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// average over all x,y,z,t and over all planes of spatial Wilson loop
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
|
||||||
|
const int R1, const int R2) {
|
||||||
|
int ndim = Umu._grid->_ndimension;
|
||||||
|
Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
|
||||||
|
Real vol = Umu._grid->gSites();
|
||||||
|
Real faces = 1.0 * (ndim - 1) * (ndim - 2);
|
||||||
|
return sumWl / vol / faces / Nc; // Nc dependent... FIXME
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
END_QEDFVOL_NAMESPACE
|
||||||
|
|
||||||
|
#endif // QEDFVOL_WILSONLOOPS_H
|
88
extras/qed-fvol/qed-fvol.cc
Normal file
88
extras/qed-fvol/qed-fvol.cc
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
#include <Global.hpp>
|
||||||
|
#include <WilsonLoops.h>
|
||||||
|
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace QCD;
|
||||||
|
using namespace QedFVol;
|
||||||
|
|
||||||
|
typedef PeriodicGaugeImpl<QedGimplR> QedPeriodicGimplR;
|
||||||
|
typedef PhotonR::GaugeField EmField;
|
||||||
|
typedef PhotonR::GaugeLinkField EmComp;
|
||||||
|
|
||||||
|
const int NCONFIGS = 10;
|
||||||
|
const int NWILSON = 10;
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
// parse command line
|
||||||
|
std::string parameterFileName;
|
||||||
|
|
||||||
|
if (argc < 2)
|
||||||
|
{
|
||||||
|
std::cerr << "usage: " << argv[0] << " <parameter file> [Grid options]";
|
||||||
|
std::cerr << std::endl;
|
||||||
|
std::exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
parameterFileName = argv[1];
|
||||||
|
|
||||||
|
// initialization
|
||||||
|
Grid_init(&argc, &argv);
|
||||||
|
QedFVolLogError.Active(GridLogError.isActive());
|
||||||
|
QedFVolLogWarning.Active(GridLogWarning.isActive());
|
||||||
|
QedFVolLogMessage.Active(GridLogMessage.isActive());
|
||||||
|
QedFVolLogIterative.Active(GridLogIterative.isActive());
|
||||||
|
QedFVolLogDebug.Active(GridLogDebug.isActive());
|
||||||
|
LOG(Message) << "Grid initialized" << std::endl;
|
||||||
|
|
||||||
|
// QED stuff
|
||||||
|
std::vector<int> latt_size = GridDefaultLatt();
|
||||||
|
std::vector<int> simd_layout = GridDefaultSimd(4, vComplex::Nsimd());
|
||||||
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
GridCartesian grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
GridParallelRNG pRNG(&grid);
|
||||||
|
PhotonR photon(PhotonR::Gauge::feynman,
|
||||||
|
PhotonR::ZmScheme::qedL);
|
||||||
|
EmField a(&grid);
|
||||||
|
EmField expA(&grid);
|
||||||
|
|
||||||
|
Complex imag_unit(0, 1);
|
||||||
|
|
||||||
|
Real wlA;
|
||||||
|
std::vector<Real> logWlAvg(NWILSON, 0.0), logWlTime(NWILSON, 0.0), logWlSpace(NWILSON, 0.0);
|
||||||
|
|
||||||
|
pRNG.SeedRandomDevice();
|
||||||
|
|
||||||
|
LOG(Message) << "Wilson loop calculation beginning" << std::endl;
|
||||||
|
for(int ic = 0; ic < NCONFIGS; ic++){
|
||||||
|
LOG(Message) << "Configuration " << ic <<std::endl;
|
||||||
|
photon.StochasticField(a, pRNG);
|
||||||
|
|
||||||
|
// Exponentiate photon field
|
||||||
|
expA = exp(imag_unit*a);
|
||||||
|
|
||||||
|
// Calculate Wilson loops
|
||||||
|
for(int iw=1; iw<=NWILSON; iw++){
|
||||||
|
wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, iw, iw) * 3;
|
||||||
|
logWlAvg[iw-1] -= 2*log(wlA);
|
||||||
|
wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, iw, iw) * 3;
|
||||||
|
logWlTime[iw-1] -= 2*log(wlA);
|
||||||
|
wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, iw, iw) * 3;
|
||||||
|
logWlSpace[iw-1] -= 2*log(wlA);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOG(Message) << "Wilson loop calculation completed" << std::endl;
|
||||||
|
|
||||||
|
// Calculate Wilson loops
|
||||||
|
for(int iw=1; iw<=10; iw++){
|
||||||
|
LOG(Message) << iw << 'x' << iw << " Wilson loop" << std::endl;
|
||||||
|
LOG(Message) << "-2log(W) average: " << logWlAvg[iw-1]/NCONFIGS << std::endl;
|
||||||
|
LOG(Message) << "-2log(W) timelike: " << logWlTime[iw-1]/NCONFIGS << std::endl;
|
||||||
|
LOG(Message) << "-2log(W) spatial: " << logWlSpace[iw-1]/NCONFIGS << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// epilogue
|
||||||
|
LOG(Message) << "Grid is finalizing now" << std::endl;
|
||||||
|
Grid_finalize();
|
||||||
|
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
|
|||||||
extra_sources+=communicator/Communicator_base.cc
|
extra_sources+=communicator/Communicator_base.cc
|
||||||
endif
|
endif
|
||||||
|
|
||||||
if BUILD_COMMS_MPI3L
|
if BUILD_COMMS_MPIT
|
||||||
extra_sources+=communicator/Communicator_mpi3_leader.cc
|
extra_sources+=communicator/Communicator_mpit.cc
|
||||||
extra_sources+=communicator/Communicator_base.cc
|
extra_sources+=communicator/Communicator_base.cc
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -37,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/algorithms/approx/Chebyshev.h>
|
#include <Grid/algorithms/approx/Chebyshev.h>
|
||||||
#include <Grid/algorithms/approx/Remez.h>
|
#include <Grid/algorithms/approx/Remez.h>
|
||||||
#include <Grid/algorithms/approx/MultiShiftFunction.h>
|
#include <Grid/algorithms/approx/MultiShiftFunction.h>
|
||||||
|
#include <Grid/algorithms/approx/Forecast.h>
|
||||||
|
|
||||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||||
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
||||||
@ -45,6 +46,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
|
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
|
||||||
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
|
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
|
||||||
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
|
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
|
||||||
|
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
|
||||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||||
#include <Grid/algorithms/CoarsenedMatrix.h>
|
#include <Grid/algorithms/CoarsenedMatrix.h>
|
||||||
#include <Grid/algorithms/FFT.h>
|
#include <Grid/algorithms/FFT.h>
|
||||||
|
@ -230,6 +230,7 @@ namespace Grid {
|
|||||||
// Barrel shift and collect global pencil
|
// Barrel shift and collect global pencil
|
||||||
std::vector<int> lcoor(Nd), gcoor(Nd);
|
std::vector<int> lcoor(Nd), gcoor(Nd);
|
||||||
result = source;
|
result = source;
|
||||||
|
int pc = processor_coor[dim];
|
||||||
for(int p=0;p<processors[dim];p++) {
|
for(int p=0;p<processors[dim];p++) {
|
||||||
PARALLEL_REGION
|
PARALLEL_REGION
|
||||||
{
|
{
|
||||||
@ -240,7 +241,8 @@ namespace Grid {
|
|||||||
for(int idx=0;idx<sgrid->lSites();idx++) {
|
for(int idx=0;idx<sgrid->lSites();idx++) {
|
||||||
sgrid->LocalIndexToLocalCoor(idx,cbuf);
|
sgrid->LocalIndexToLocalCoor(idx,cbuf);
|
||||||
peekLocalSite(s,result,cbuf);
|
peekLocalSite(s,result,cbuf);
|
||||||
cbuf[dim]+=p*L;
|
cbuf[dim]+=((pc+p) % processors[dim])*L;
|
||||||
|
// cbuf[dim]+=p*L;
|
||||||
pokeLocalSite(s,pgbuf,cbuf);
|
pokeLocalSite(s,pgbuf,cbuf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -278,7 +280,6 @@ namespace Grid {
|
|||||||
flops+= flops_call*NN;
|
flops+= flops_call*NN;
|
||||||
|
|
||||||
// writing out result
|
// writing out result
|
||||||
int pc = processor_coor[dim];
|
|
||||||
PARALLEL_REGION
|
PARALLEL_REGION
|
||||||
{
|
{
|
||||||
std::vector<int> clbuf(Nd), cgbuf(Nd);
|
std::vector<int> clbuf(Nd), cgbuf(Nd);
|
||||||
|
152
lib/algorithms/approx/Forecast.h
Normal file
152
lib/algorithms/approx/Forecast.h
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/algorithms/approx/Forecast.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#ifndef INCLUDED_FORECAST_H
|
||||||
|
#define INCLUDED_FORECAST_H
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
// Abstract base class.
|
||||||
|
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
|
||||||
|
// and returns a forecasted solution to the system D*psi = phi (psi).
|
||||||
|
template<class Matrix, class Field>
|
||||||
|
class Forecast
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
|
||||||
|
// used to forecast solutions across poles of the EOFA heatbath.
|
||||||
|
//
|
||||||
|
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
|
||||||
|
template<class Matrix, class Field>
|
||||||
|
class ChronoForecast : public Forecast<Matrix,Field>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
|
||||||
|
{
|
||||||
|
int degree = prev_solns.size();
|
||||||
|
Field chi(phi); // forecasted solution
|
||||||
|
|
||||||
|
// Trivial cases
|
||||||
|
if(degree == 0){ chi = zero; return chi; }
|
||||||
|
else if(degree == 1){ return prev_solns[0]; }
|
||||||
|
|
||||||
|
RealD dot;
|
||||||
|
ComplexD xp;
|
||||||
|
Field r(phi); // residual
|
||||||
|
Field Mv(phi);
|
||||||
|
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
|
||||||
|
std::vector<Field> MdagMv(degree,phi);
|
||||||
|
|
||||||
|
// Array to hold the matrix elements
|
||||||
|
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
|
||||||
|
|
||||||
|
// Solution and source vectors
|
||||||
|
std::vector<ComplexD> a(degree);
|
||||||
|
std::vector<ComplexD> b(degree);
|
||||||
|
|
||||||
|
// Orthonormalize the vector basis
|
||||||
|
for(int i=0; i<degree; i++){
|
||||||
|
v[i] *= 1.0/std::sqrt(norm2(v[i]));
|
||||||
|
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perform sparse matrix multiplication and construct rhs
|
||||||
|
for(int i=0; i<degree; i++){
|
||||||
|
b[i] = innerProduct(v[i],phi);
|
||||||
|
Mat.M(v[i],Mv);
|
||||||
|
Mat.Mdag(Mv,MdagMv[i]);
|
||||||
|
G[i][i] = innerProduct(v[i],MdagMv[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Construct the matrix
|
||||||
|
for(int j=0; j<degree; j++){
|
||||||
|
for(int k=j+1; k<degree; k++){
|
||||||
|
G[j][k] = innerProduct(v[j],MdagMv[k]);
|
||||||
|
G[k][j] = std::conj(G[j][k]);
|
||||||
|
}}
|
||||||
|
|
||||||
|
// Gauss-Jordan elimination with partial pivoting
|
||||||
|
for(int i=0; i<degree; i++){
|
||||||
|
|
||||||
|
// Perform partial pivoting
|
||||||
|
int k = i;
|
||||||
|
for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
|
||||||
|
if(k != i){
|
||||||
|
xp = b[k];
|
||||||
|
b[k] = b[i];
|
||||||
|
b[i] = xp;
|
||||||
|
for(int j=0; j<degree; j++){
|
||||||
|
xp = G[k][j];
|
||||||
|
G[k][j] = G[i][j];
|
||||||
|
G[i][j] = xp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert matrix to upper triangular form
|
||||||
|
for(int j=i+1; j<degree; j++){
|
||||||
|
xp = G[j][i]/G[i][i];
|
||||||
|
b[j] -= xp * b[i];
|
||||||
|
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use Gaussian elimination to solve equations and calculate initial guess
|
||||||
|
chi = zero;
|
||||||
|
r = phi;
|
||||||
|
for(int i=degree-1; i>=0; i--){
|
||||||
|
a[i] = 0.0;
|
||||||
|
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
|
||||||
|
a[i] = (b[i]-a[i])/G[i][i];
|
||||||
|
chi += a[i]*v[i];
|
||||||
|
r -= a[i]*MdagMv[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
RealD true_r(0.0);
|
||||||
|
ComplexD tmp;
|
||||||
|
for(int i=0; i<degree; i++){
|
||||||
|
tmp = -b[i];
|
||||||
|
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
|
||||||
|
tmp = std::conj(tmp)*tmp;
|
||||||
|
true_r += std::sqrt(tmp.real());
|
||||||
|
}
|
||||||
|
|
||||||
|
RealD error = std::sqrt(norm2(r)/norm2(phi));
|
||||||
|
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
|
||||||
|
|
||||||
|
return chi;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
@ -87,15 +87,22 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
sliceInnerProductMatrix(m_rr,R,R,Orthog);
|
sliceInnerProductMatrix(m_rr,R,R,Orthog);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
// Force manifest hermitian to avoid rounding related
|
||||||
// Cholesky from Eigen
|
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
||||||
// There exists a ldlt that is documented as more stable
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
std::cout << " Calling Cholesky ldlt on m_rr " << m_rr <<std::endl;
|
||||||
|
Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();
|
||||||
|
std::cout << " Called Cholesky ldlt on m_rr " << L_ldlt <<std::endl;
|
||||||
|
auto D_ldlt = m_rr.ldlt().vectorD();
|
||||||
|
std::cout << " Called Cholesky ldlt on m_rr " << D_ldlt <<std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// std::cout << " Calling Cholesky llt on m_rr " <<std::endl;
|
||||||
|
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||||
|
// std::cout << " Called Cholesky llt on m_rr " << L <<std::endl;
|
||||||
C = L.adjoint();
|
C = L.adjoint();
|
||||||
Cinv = C.inverse();
|
Cinv = C.inverse();
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Q = R C^{-1}
|
// Q = R C^{-1}
|
||||||
//
|
//
|
||||||
@ -103,7 +110,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
|||||||
//
|
//
|
||||||
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
|
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// FIXME:: make a sliceMulMatrix to avoid zero vector
|
|
||||||
sliceMulMatrix(Q,Cinv,R,Orthog);
|
sliceMulMatrix(Q,Cinv,R,Orthog);
|
||||||
}
|
}
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -199,7 +205,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
|
|
||||||
Linop.HermOp(X, AD);
|
Linop.HermOp(X, AD);
|
||||||
tmp = B - AD;
|
tmp = B - AD;
|
||||||
|
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
|
||||||
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
||||||
|
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
|
||||||
|
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
|
||||||
|
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
|
||||||
|
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
|
||||||
D=Q;
|
D=Q;
|
||||||
|
|
||||||
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
|
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
|
||||||
@ -221,12 +232,14 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
MatrixTimer.Start();
|
MatrixTimer.Start();
|
||||||
Linop.HermOp(D, Z);
|
Linop.HermOp(D, Z);
|
||||||
MatrixTimer.Stop();
|
MatrixTimer.Stop();
|
||||||
|
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
|
||||||
|
|
||||||
//4. M = [D^dag Z]^{-1}
|
//4. M = [D^dag Z]^{-1}
|
||||||
sliceInnerTimer.Start();
|
sliceInnerTimer.Start();
|
||||||
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
|
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
|
||||||
sliceInnerTimer.Stop();
|
sliceInnerTimer.Stop();
|
||||||
m_M = m_DZ.inverse();
|
m_M = m_DZ.inverse();
|
||||||
|
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
|
||||||
|
|
||||||
//5. X = X + D MC
|
//5. X = X + D MC
|
||||||
m_tmp = m_M * m_C;
|
m_tmp = m_M * m_C;
|
||||||
|
256
lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
Normal file
256
lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
Normal file
@ -0,0 +1,256 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Christopher Kelly <ckelly@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
|
||||||
|
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
|
||||||
|
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
|
||||||
|
public:
|
||||||
|
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
|
||||||
|
// Defaults true.
|
||||||
|
RealD Tolerance;
|
||||||
|
Integer MaxIterations;
|
||||||
|
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
|
||||||
|
Integer ReliableUpdatesPerformed;
|
||||||
|
|
||||||
|
bool DoFinalCleanup; //Final DP cleanup, defaults to true
|
||||||
|
Integer IterationsToCleanup; //Final DP cleanup step iterations
|
||||||
|
|
||||||
|
LinearOperatorBase<FieldF> &Linop_f;
|
||||||
|
LinearOperatorBase<FieldD> &Linop_d;
|
||||||
|
GridBase* SinglePrecGrid;
|
||||||
|
RealD Delta; //reliable update parameter
|
||||||
|
|
||||||
|
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
|
||||||
|
LinearOperatorBase<FieldF> *Linop_fallback;
|
||||||
|
RealD fallback_transition_tol;
|
||||||
|
|
||||||
|
|
||||||
|
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
|
||||||
|
: Tolerance(tol),
|
||||||
|
MaxIterations(maxit),
|
||||||
|
Delta(_delta),
|
||||||
|
Linop_f(_Linop_f),
|
||||||
|
Linop_d(_Linop_d),
|
||||||
|
SinglePrecGrid(_sp_grid),
|
||||||
|
ErrorOnNoConverge(err_on_no_conv),
|
||||||
|
DoFinalCleanup(true),
|
||||||
|
Linop_fallback(NULL)
|
||||||
|
{};
|
||||||
|
|
||||||
|
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
|
||||||
|
Linop_fallback = &_Linop_fallback;
|
||||||
|
fallback_transition_tol = _fallback_transition_tol;
|
||||||
|
}
|
||||||
|
|
||||||
|
void operator()(const FieldD &src, FieldD &psi) {
|
||||||
|
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
|
||||||
|
bool using_fallback = false;
|
||||||
|
|
||||||
|
psi.checkerboard = src.checkerboard;
|
||||||
|
conformable(psi, src);
|
||||||
|
|
||||||
|
RealD cp, c, a, d, b, ssq, qq, b_pred;
|
||||||
|
|
||||||
|
FieldD p(src);
|
||||||
|
FieldD mmp(src);
|
||||||
|
FieldD r(src);
|
||||||
|
|
||||||
|
// Initial residual computation & set up
|
||||||
|
RealD guess = norm2(psi);
|
||||||
|
assert(std::isnan(guess) == 0);
|
||||||
|
|
||||||
|
Linop_d.HermOpAndNorm(psi, mmp, d, b);
|
||||||
|
|
||||||
|
r = src - mmp;
|
||||||
|
p = r;
|
||||||
|
|
||||||
|
a = norm2(p);
|
||||||
|
cp = a;
|
||||||
|
ssq = norm2(src);
|
||||||
|
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
|
||||||
|
|
||||||
|
RealD rsq = Tolerance * Tolerance * ssq;
|
||||||
|
|
||||||
|
// Check if guess is really REALLY good :)
|
||||||
|
if (cp <= rsq) {
|
||||||
|
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
|
||||||
|
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
//Single prec initialization
|
||||||
|
FieldF r_f(SinglePrecGrid);
|
||||||
|
r_f.checkerboard = r.checkerboard;
|
||||||
|
precisionChange(r_f, r);
|
||||||
|
|
||||||
|
FieldF psi_f(r_f);
|
||||||
|
psi_f = zero;
|
||||||
|
|
||||||
|
FieldF p_f(r_f);
|
||||||
|
FieldF mmp_f(r_f);
|
||||||
|
|
||||||
|
RealD MaxResidSinceLastRelUp = cp; //initial residual
|
||||||
|
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4)
|
||||||
|
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
|
||||||
|
|
||||||
|
GridStopWatch LinalgTimer;
|
||||||
|
GridStopWatch MatrixTimer;
|
||||||
|
GridStopWatch SolverTimer;
|
||||||
|
|
||||||
|
SolverTimer.Start();
|
||||||
|
int k = 0;
|
||||||
|
int l = 0;
|
||||||
|
|
||||||
|
for (k = 1; k <= MaxIterations; k++) {
|
||||||
|
c = cp;
|
||||||
|
|
||||||
|
MatrixTimer.Start();
|
||||||
|
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
|
||||||
|
MatrixTimer.Stop();
|
||||||
|
|
||||||
|
LinalgTimer.Start();
|
||||||
|
|
||||||
|
a = c / d;
|
||||||
|
b_pred = a * (a * qq - d) / c;
|
||||||
|
|
||||||
|
cp = axpy_norm(r_f, -a, mmp_f, r_f);
|
||||||
|
b = cp / c;
|
||||||
|
|
||||||
|
// Fuse these loops ; should be really easy
|
||||||
|
psi_f = a * p_f + psi_f;
|
||||||
|
//p_f = p_f * b + r_f;
|
||||||
|
|
||||||
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
|
||||||
|
<< " residual " << cp << " target " << rsq << std::endl;
|
||||||
|
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
|
||||||
|
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
|
||||||
|
|
||||||
|
if(cp > MaxResidSinceLastRelUp){
|
||||||
|
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
|
||||||
|
MaxResidSinceLastRelUp = cp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stopping condition
|
||||||
|
if (cp <= rsq) {
|
||||||
|
//Although not written in the paper, I assume that I have to add on the final solution
|
||||||
|
precisionChange(mmp, psi_f);
|
||||||
|
psi = psi + mmp;
|
||||||
|
|
||||||
|
|
||||||
|
SolverTimer.Stop();
|
||||||
|
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
|
||||||
|
p = mmp - src;
|
||||||
|
|
||||||
|
RealD srcnorm = sqrt(norm2(src));
|
||||||
|
RealD resnorm = sqrt(norm2(p));
|
||||||
|
RealD true_residual = resnorm / srcnorm;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
|
||||||
|
|
||||||
|
IterationsToComplete = k;
|
||||||
|
ReliableUpdatesPerformed = l;
|
||||||
|
|
||||||
|
if(DoFinalCleanup){
|
||||||
|
//Do a final CG to cleanup
|
||||||
|
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
|
||||||
|
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
|
||||||
|
CG.ErrorOnNoConverge = ErrorOnNoConverge;
|
||||||
|
CG(Linop_d,src,psi);
|
||||||
|
IterationsToCleanup = CG.IterationsToComplete;
|
||||||
|
}
|
||||||
|
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
|
||||||
|
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
|
||||||
|
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
|
||||||
|
precisionChange(mmp, psi_f);
|
||||||
|
psi = psi + mmp;
|
||||||
|
|
||||||
|
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
|
||||||
|
r = src - mmp;
|
||||||
|
|
||||||
|
psi_f = zero;
|
||||||
|
precisionChange(r_f, r);
|
||||||
|
cp = norm2(r);
|
||||||
|
MaxResidSinceLastRelUp = cp;
|
||||||
|
|
||||||
|
b = cp/c;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
|
||||||
|
|
||||||
|
l = l+1;
|
||||||
|
}
|
||||||
|
|
||||||
|
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
|
||||||
|
|
||||||
|
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
|
||||||
|
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
|
||||||
|
Linop_f_use = Linop_fallback;
|
||||||
|
using_fallback = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
|
if (ErrorOnNoConverge) assert(0);
|
||||||
|
IterationsToComplete = k;
|
||||||
|
ReliableUpdatesPerformed = l;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
@ -1,7 +1,5 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
@ -11,7 +9,7 @@ int PointerCache::victim;
|
|||||||
|
|
||||||
void *PointerCache::Insert(void *ptr,size_t bytes) {
|
void *PointerCache::Insert(void *ptr,size_t bytes) {
|
||||||
|
|
||||||
if (bytes < 4096 ) return NULL;
|
if (bytes < 4096 ) return ptr;
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
assert(omp_in_parallel()==0);
|
assert(omp_in_parallel()==0);
|
||||||
@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||||
|
{
|
||||||
|
#ifdef __linux__
|
||||||
|
int fd = open("/proc/self/pagemap", O_RDONLY);
|
||||||
|
assert(fd >= 0);
|
||||||
|
const int page_size = 4096;
|
||||||
|
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||||
|
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||||
|
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||||
|
uint64_t pagedata[npages];
|
||||||
|
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||||
|
assert(ret == offset);
|
||||||
|
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
||||||
|
assert(ret == sizeof(uint64_t) * npages);
|
||||||
|
int nhugepages = npages / 512;
|
||||||
|
int n4ktotal, nnothuge;
|
||||||
|
n4ktotal = 0;
|
||||||
|
nnothuge = 0;
|
||||||
|
for (int i = 0; i < nhugepages; ++i) {
|
||||||
|
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
|
||||||
|
for (int j = 0; j < 512; ++j) {
|
||||||
|
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
|
||||||
|
++n4ktotal;
|
||||||
|
if (pageaddr != baseaddr + j * page_size)
|
||||||
|
++nnothuge;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int rank = CartesianCommunicator::RankWorld();
|
||||||
|
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -64,6 +64,8 @@ namespace Grid {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void check_huge_pages(void *Buf,uint64_t BYTES);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// A lattice of something, but assume the something is SIMDized.
|
// A lattice of something, but assume the something is SIMDized.
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
@ -92,18 +94,34 @@ public:
|
|||||||
size_type bytes = __n*sizeof(_Tp);
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
|
|
||||||
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
||||||
|
// if ( ptr != NULL )
|
||||||
|
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
|
||||||
|
|
||||||
|
//////////////////
|
||||||
|
// Hack 2MB align; could make option probably doesn't need configurability
|
||||||
|
//////////////////
|
||||||
|
//define GRID_ALLOC_ALIGN (128)
|
||||||
|
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
|
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
|
||||||
#else
|
#else
|
||||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
|
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
|
||||||
#endif
|
#endif
|
||||||
|
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
|
||||||
|
// First touch optimise in threaded loop
|
||||||
|
uint8_t *cp = (uint8_t *)ptr;
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
#pragma omp parallel for
|
||||||
|
#endif
|
||||||
|
for(size_type n=0;n<bytes;n+=4096){
|
||||||
|
cp[n]=0;
|
||||||
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void deallocate(pointer __p, size_type __n) {
|
void deallocate(pointer __p, size_type __n) {
|
||||||
size_type bytes = __n * sizeof(_Tp);
|
size_type bytes = __n * sizeof(_Tp);
|
||||||
|
|
||||||
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
|
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
|
||||||
|
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
@ -182,10 +200,19 @@ public:
|
|||||||
pointer allocate(size_type __n, const void* _p= 0)
|
pointer allocate(size_type __n, const void* _p= 0)
|
||||||
{
|
{
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
|
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
|
||||||
#else
|
#else
|
||||||
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
|
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
|
||||||
#endif
|
#endif
|
||||||
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
|
uint8_t *cp = (uint8_t *)ptr;
|
||||||
|
if ( ptr ) {
|
||||||
|
// One touch per 4k page, static OMP loop to catch same loop order
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
|
for(size_type n=0;n<bytes;n+=4096){
|
||||||
|
cp[n]=0;
|
||||||
|
}
|
||||||
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
void deallocate(pointer __p, size_type) {
|
void deallocate(pointer __p, size_type) {
|
||||||
|
@ -187,17 +187,18 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void show_decomposition(){
|
void show_decomposition(){
|
||||||
std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl;
|
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
|
||||||
std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl;
|
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
|
||||||
std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl;
|
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
|
||||||
std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
|
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
|
||||||
std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl;
|
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
|
||||||
std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl;
|
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
|
||||||
std::cout << GridLogMessage << "iSites : " << _isites << std::endl;
|
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
|
||||||
std::cout << GridLogMessage << "oSites : " << _osites << std::endl;
|
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
|
||||||
std::cout << GridLogMessage << "lSites : " << lSites() << std::endl;
|
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
|
||||||
std::cout << GridLogMessage << "gSites : " << gSites() << std::endl;
|
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
|
||||||
std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl;
|
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
|
@ -85,73 +85,78 @@ public:
|
|||||||
const std::vector<int> &simd_layout,
|
const std::vector<int> &simd_layout,
|
||||||
const std::vector<int> &processor_grid)
|
const std::vector<int> &processor_grid)
|
||||||
{
|
{
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Grid information
|
// Grid information
|
||||||
///////////////////////
|
///////////////////////
|
||||||
_ndimension = dimensions.size();
|
_ndimension = dimensions.size();
|
||||||
|
|
||||||
_fdimensions.resize(_ndimension);
|
_fdimensions.resize(_ndimension);
|
||||||
_gdimensions.resize(_ndimension);
|
_gdimensions.resize(_ndimension);
|
||||||
_ldimensions.resize(_ndimension);
|
_ldimensions.resize(_ndimension);
|
||||||
_rdimensions.resize(_ndimension);
|
_rdimensions.resize(_ndimension);
|
||||||
_simd_layout.resize(_ndimension);
|
_simd_layout.resize(_ndimension);
|
||||||
_lstart.resize(_ndimension);
|
_lstart.resize(_ndimension);
|
||||||
_lend.resize(_ndimension);
|
_lend.resize(_ndimension);
|
||||||
|
|
||||||
_ostride.resize(_ndimension);
|
_ostride.resize(_ndimension);
|
||||||
_istride.resize(_ndimension);
|
_istride.resize(_ndimension);
|
||||||
|
|
||||||
_fsites = _gsites = _osites = _isites = 1;
|
_fsites = _gsites = _osites = _isites = 1;
|
||||||
|
|
||||||
for(int d=0;d<_ndimension;d++){
|
for (int d = 0; d < _ndimension; d++)
|
||||||
_fdimensions[d] = dimensions[d]; // Global dimensions
|
{
|
||||||
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
_fdimensions[d] = dimensions[d]; // Global dimensions
|
||||||
_simd_layout[d] = simd_layout[d];
|
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
||||||
_fsites = _fsites * _fdimensions[d];
|
_simd_layout[d] = simd_layout[d];
|
||||||
_gsites = _gsites * _gdimensions[d];
|
_fsites = _fsites * _fdimensions[d];
|
||||||
|
_gsites = _gsites * _gdimensions[d];
|
||||||
|
|
||||||
//FIXME check for exact division
|
// Use a reduced simd grid
|
||||||
|
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
|
||||||
|
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
|
||||||
|
|
||||||
// Use a reduced simd grid
|
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
|
||||||
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
|
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
|
||||||
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
|
|
||||||
_lstart[d] = _processor_coor[d]*_ldimensions[d];
|
|
||||||
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
|
|
||||||
_osites *= _rdimensions[d];
|
|
||||||
_isites *= _simd_layout[d];
|
|
||||||
|
|
||||||
// Addressing support
|
_lstart[d] = _processor_coor[d] * _ldimensions[d];
|
||||||
if ( d==0 ) {
|
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
|
||||||
_ostride[d] = 1;
|
_osites *= _rdimensions[d];
|
||||||
_istride[d] = 1;
|
_isites *= _simd_layout[d];
|
||||||
} else {
|
|
||||||
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
// Addressing support
|
||||||
_istride[d] = _istride[d-1]*_simd_layout[d-1];
|
if (d == 0)
|
||||||
}
|
{
|
||||||
|
_ostride[d] = 1;
|
||||||
|
_istride[d] = 1;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
///////////////////////
|
{
|
||||||
// subplane information
|
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
|
||||||
///////////////////////
|
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
|
||||||
_slice_block.resize(_ndimension);
|
|
||||||
_slice_stride.resize(_ndimension);
|
|
||||||
_slice_nblock.resize(_ndimension);
|
|
||||||
|
|
||||||
int block =1;
|
|
||||||
int nblock=1;
|
|
||||||
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
|
|
||||||
|
|
||||||
for(int d=0;d<_ndimension;d++){
|
|
||||||
nblock/=_rdimensions[d];
|
|
||||||
_slice_block[d] =block;
|
|
||||||
_slice_stride[d]=_ostride[d]*_rdimensions[d];
|
|
||||||
_slice_nblock[d]=nblock;
|
|
||||||
block = block*_rdimensions[d];
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////
|
||||||
|
// subplane information
|
||||||
|
///////////////////////
|
||||||
|
_slice_block.resize(_ndimension);
|
||||||
|
_slice_stride.resize(_ndimension);
|
||||||
|
_slice_nblock.resize(_ndimension);
|
||||||
|
|
||||||
|
int block = 1;
|
||||||
|
int nblock = 1;
|
||||||
|
for (int d = 0; d < _ndimension; d++)
|
||||||
|
nblock *= _rdimensions[d];
|
||||||
|
|
||||||
|
for (int d = 0; d < _ndimension; d++)
|
||||||
|
{
|
||||||
|
nblock /= _rdimensions[d];
|
||||||
|
_slice_block[d] = block;
|
||||||
|
_slice_stride[d] = _ostride[d] * _rdimensions[d];
|
||||||
|
_slice_nblock[d] = nblock;
|
||||||
|
block = block * _rdimensions[d];
|
||||||
|
}
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -164,20 +164,20 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
void Init(const std::vector<int> &dimensions,
|
void Init(const std::vector<int> &dimensions,
|
||||||
const std::vector<int> &simd_layout,
|
const std::vector<int> &simd_layout,
|
||||||
const std::vector<int> &processor_grid,
|
const std::vector<int> &processor_grid,
|
||||||
const std::vector<int> &checker_dim_mask,
|
const std::vector<int> &checker_dim_mask,
|
||||||
int checker_dim)
|
int checker_dim)
|
||||||
{
|
{
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Grid information
|
// Grid information
|
||||||
///////////////////////
|
///////////////////////
|
||||||
_checker_dim = checker_dim;
|
_checker_dim = checker_dim;
|
||||||
assert(checker_dim_mask[checker_dim]==1);
|
assert(checker_dim_mask[checker_dim] == 1);
|
||||||
_ndimension = dimensions.size();
|
_ndimension = dimensions.size();
|
||||||
assert(checker_dim_mask.size()==_ndimension);
|
assert(checker_dim_mask.size() == _ndimension);
|
||||||
assert(processor_grid.size()==_ndimension);
|
assert(processor_grid.size() == _ndimension);
|
||||||
assert(simd_layout.size()==_ndimension);
|
assert(simd_layout.size() == _ndimension);
|
||||||
|
|
||||||
_fdimensions.resize(_ndimension);
|
_fdimensions.resize(_ndimension);
|
||||||
_gdimensions.resize(_ndimension);
|
_gdimensions.resize(_ndimension);
|
||||||
@ -192,47 +192,55 @@ public:
|
|||||||
|
|
||||||
_fsites = _gsites = _osites = _isites = 1;
|
_fsites = _gsites = _osites = _isites = 1;
|
||||||
|
|
||||||
_checker_dim_mask=checker_dim_mask;
|
_checker_dim_mask = checker_dim_mask;
|
||||||
|
|
||||||
for(int d=0;d<_ndimension;d++){
|
for (int d = 0; d < _ndimension; d++)
|
||||||
_fdimensions[d] = dimensions[d];
|
{
|
||||||
_gdimensions[d] = _fdimensions[d];
|
_fdimensions[d] = dimensions[d];
|
||||||
_fsites = _fsites * _fdimensions[d];
|
_gdimensions[d] = _fdimensions[d];
|
||||||
_gsites = _gsites * _gdimensions[d];
|
_fsites = _fsites * _fdimensions[d];
|
||||||
|
_gsites = _gsites * _gdimensions[d];
|
||||||
|
|
||||||
if (d==_checker_dim) {
|
if (d == _checker_dim)
|
||||||
_gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
|
{
|
||||||
}
|
assert((_gdimensions[d] & 0x1) == 0);
|
||||||
_ldimensions[d] = _gdimensions[d]/_processors[d];
|
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
|
||||||
_lstart[d] = _processor_coor[d]*_ldimensions[d];
|
}
|
||||||
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
|
_ldimensions[d] = _gdimensions[d] / _processors[d];
|
||||||
|
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
|
||||||
|
_lstart[d] = _processor_coor[d] * _ldimensions[d];
|
||||||
|
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
|
||||||
|
|
||||||
// Use a reduced simd grid
|
// Use a reduced simd grid
|
||||||
_simd_layout[d] = simd_layout[d];
|
_simd_layout[d] = simd_layout[d];
|
||||||
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
|
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
|
||||||
assert(_rdimensions[d]>0);
|
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
|
||||||
|
assert(_rdimensions[d] > 0);
|
||||||
|
|
||||||
// all elements of a simd vector must have same checkerboard.
|
// all elements of a simd vector must have same checkerboard.
|
||||||
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
|
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
|
||||||
if ( _simd_layout[d]>1 ) {
|
if (_simd_layout[d] > 1)
|
||||||
if ( checker_dim_mask[d] ) {
|
{
|
||||||
assert( (_rdimensions[d]&0x1) == 0 );
|
if (checker_dim_mask[d])
|
||||||
}
|
{
|
||||||
}
|
assert((_rdimensions[d] & 0x1) == 0);
|
||||||
|
}
|
||||||
_osites *= _rdimensions[d];
|
}
|
||||||
_isites *= _simd_layout[d];
|
|
||||||
|
|
||||||
// Addressing support
|
|
||||||
if ( d==0 ) {
|
|
||||||
_ostride[d] = 1;
|
|
||||||
_istride[d] = 1;
|
|
||||||
} else {
|
|
||||||
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
|
|
||||||
_istride[d] = _istride[d-1]*_simd_layout[d-1];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
_osites *= _rdimensions[d];
|
||||||
|
_isites *= _simd_layout[d];
|
||||||
|
|
||||||
|
// Addressing support
|
||||||
|
if (d == 0)
|
||||||
|
{
|
||||||
|
_ostride[d] = 1;
|
||||||
|
_istride[d] = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
|
||||||
|
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -242,58 +250,69 @@ public:
|
|||||||
_slice_stride.resize(_ndimension);
|
_slice_stride.resize(_ndimension);
|
||||||
_slice_nblock.resize(_ndimension);
|
_slice_nblock.resize(_ndimension);
|
||||||
|
|
||||||
int block =1;
|
int block = 1;
|
||||||
int nblock=1;
|
int nblock = 1;
|
||||||
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
|
for (int d = 0; d < _ndimension; d++)
|
||||||
|
nblock *= _rdimensions[d];
|
||||||
|
|
||||||
for(int d=0;d<_ndimension;d++){
|
for (int d = 0; d < _ndimension; d++)
|
||||||
nblock/=_rdimensions[d];
|
{
|
||||||
_slice_block[d] =block;
|
nblock /= _rdimensions[d];
|
||||||
_slice_stride[d]=_ostride[d]*_rdimensions[d];
|
_slice_block[d] = block;
|
||||||
_slice_nblock[d]=nblock;
|
_slice_stride[d] = _ostride[d] * _rdimensions[d];
|
||||||
block = block*_rdimensions[d];
|
_slice_nblock[d] = nblock;
|
||||||
|
block = block * _rdimensions[d];
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// Create a checkerboard lookup table
|
// Create a checkerboard lookup table
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
int rvol = 1;
|
int rvol = 1;
|
||||||
for(int d=0;d<_ndimension;d++){
|
for (int d = 0; d < _ndimension; d++)
|
||||||
rvol=rvol * _rdimensions[d];
|
{
|
||||||
|
rvol = rvol * _rdimensions[d];
|
||||||
}
|
}
|
||||||
_checker_board.resize(rvol);
|
_checker_board.resize(rvol);
|
||||||
for(int osite=0;osite<_osites;osite++){
|
for (int osite = 0; osite < _osites; osite++)
|
||||||
_checker_board[osite] = CheckerBoardFromOindex (osite);
|
{
|
||||||
|
_checker_board[osite] = CheckerBoardFromOindex(osite);
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
protected:
|
|
||||||
|
protected:
|
||||||
virtual int oIndex(std::vector<int> &coor)
|
virtual int oIndex(std::vector<int> &coor)
|
||||||
{
|
{
|
||||||
int idx=0;
|
int idx = 0;
|
||||||
for(int d=0;d<_ndimension;d++) {
|
for (int d = 0; d < _ndimension; d++)
|
||||||
if( d==_checker_dim ) {
|
{
|
||||||
idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
|
if (d == _checker_dim)
|
||||||
} else {
|
{
|
||||||
idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
|
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return idx;
|
return idx;
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual int iIndex(std::vector<int> &lcoor)
|
virtual int iIndex(std::vector<int> &lcoor)
|
||||||
{
|
{
|
||||||
int idx=0;
|
int idx = 0;
|
||||||
for(int d=0;d<_ndimension;d++) {
|
for (int d = 0; d < _ndimension; d++)
|
||||||
if( d==_checker_dim ) {
|
{
|
||||||
idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
|
if (d == _checker_dim)
|
||||||
} else {
|
{
|
||||||
idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
|
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
|
||||||
}
|
}
|
||||||
}
|
else
|
||||||
return idx;
|
{
|
||||||
|
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return idx;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
@ -33,8 +37,11 @@ namespace Grid {
|
|||||||
// Info that is setup once and indept of cartesian layout
|
// Info that is setup once and indept of cartesian layout
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void * CartesianCommunicator::ShmCommBuf;
|
void * CartesianCommunicator::ShmCommBuf;
|
||||||
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
|
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
|
||||||
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
|
CartesianCommunicator::CommunicatorPolicy_t
|
||||||
|
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
|
||||||
|
int CartesianCommunicator::nCommThreads = -1;
|
||||||
|
int CartesianCommunicator::Hugepages = 0;
|
||||||
|
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
// Alloc, free shmem region
|
// Alloc, free shmem region
|
||||||
@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
|
|||||||
GlobalSumVector((double *)c,2*N);
|
GlobalSumVector((double *)c,2*N);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
|
#if !defined( GRID_COMMS_MPI3)
|
||||||
|
|
||||||
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
|
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
|
||||||
int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
|
int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
|
||||||
|
#endif
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
|
||||||
void *xmit,
|
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
void *recv,
|
void *recv,
|
||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes)
|
int bytes, int dir)
|
||||||
{
|
{
|
||||||
|
std::vector<CommsRequest_t> list;
|
||||||
|
// Discard the "dir"
|
||||||
|
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||||
|
SendToRecvFromComplete(list);
|
||||||
|
return 2.0*bytes;
|
||||||
|
}
|
||||||
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes, int dir)
|
||||||
|
{
|
||||||
|
// Discard the "dir"
|
||||||
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
{
|
{
|
||||||
SendToRecvFromComplete(waitall);
|
SendToRecvFromComplete(waitall);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined( GRID_COMMS_MPI3)
|
||||||
|
|
||||||
void CartesianCommunicator::StencilBarrier(void){};
|
void CartesianCommunicator::StencilBarrier(void){};
|
||||||
|
|
||||||
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
|
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
|
||||||
@ -121,8 +146,25 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::ShmInitGeneric(void){
|
void CartesianCommunicator::ShmInitGeneric(void){
|
||||||
|
#if 1
|
||||||
|
|
||||||
|
int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
|
||||||
|
#ifdef MAP_HUGETLB
|
||||||
|
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
|
||||||
|
#endif
|
||||||
|
ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
|
||||||
|
if (ShmCommBuf == (void *)MAP_FAILED) {
|
||||||
|
perror("mmap failed ");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
#ifdef MADV_HUGEPAGE
|
||||||
|
if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
|
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
|
||||||
ShmCommBuf=(void *)&ShmBufStorageVector[0];
|
ShmCommBuf=(void *)&ShmBufStorageVector[0];
|
||||||
|
#endif
|
||||||
|
bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_COMMS_MPI3
|
#ifdef GRID_COMMS_MPI3
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_COMMS_MPI3L
|
#ifdef GRID_COMMS_MPIT
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_COMMS_SHMEM
|
#ifdef GRID_COMMS_SHMEM
|
||||||
@ -50,12 +50,24 @@ namespace Grid {
|
|||||||
class CartesianCommunicator {
|
class CartesianCommunicator {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
// 65536 ranks per node adequate for now
|
|
||||||
|
////////////////////////////////////////////
|
||||||
|
// Isend/Irecv/Wait, or Sendrecv blocking
|
||||||
|
////////////////////////////////////////////
|
||||||
|
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
|
||||||
|
static CommunicatorPolicy_t CommunicatorPolicy;
|
||||||
|
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
|
||||||
|
|
||||||
|
///////////////////////////////////////////
|
||||||
|
// Up to 65536 ranks per node adequate for now
|
||||||
// 128MB shared memory for comms enought for 48^4 local vol comms
|
// 128MB shared memory for comms enought for 48^4 local vol comms
|
||||||
// Give external control (command line override?) of this
|
// Give external control (command line override?) of this
|
||||||
|
///////////////////////////////////////////
|
||||||
static const int MAXLOG2RANKSPERNODE = 16;
|
static const int MAXLOG2RANKSPERNODE = 16;
|
||||||
static uint64_t MAX_MPI_SHM_BYTES;
|
static uint64_t MAX_MPI_SHM_BYTES;
|
||||||
|
static int nCommThreads;
|
||||||
|
// use explicit huge pages
|
||||||
|
static int Hugepages;
|
||||||
|
|
||||||
// Communicator should know nothing of the physics grid, only processor grid.
|
// Communicator should know nothing of the physics grid, only processor grid.
|
||||||
int _Nprocessors; // How many in all
|
int _Nprocessors; // How many in all
|
||||||
@ -64,15 +76,19 @@ class CartesianCommunicator {
|
|||||||
std::vector<int> _processor_coor; // linear processor coordinate
|
std::vector<int> _processor_coor; // linear processor coordinate
|
||||||
unsigned long _ndimension;
|
unsigned long _ndimension;
|
||||||
|
|
||||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
|
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
|
||||||
static MPI_Comm communicator_world;
|
static MPI_Comm communicator_world;
|
||||||
MPI_Comm communicator;
|
|
||||||
|
MPI_Comm communicator;
|
||||||
|
std::vector<MPI_Comm> communicator_halo;
|
||||||
|
|
||||||
typedef MPI_Request CommsRequest_t;
|
typedef MPI_Request CommsRequest_t;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
typedef int CommsRequest_t;
|
typedef int CommsRequest_t;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Helper functionality for SHM Windows common to all other impls
|
// Helper functionality for SHM Windows common to all other impls
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
@ -118,10 +134,6 @@ class CartesianCommunicator {
|
|||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
static void * ShmCommBuf;
|
static void * ShmCommBuf;
|
||||||
|
|
||||||
// Isend/Irecv/Wait, or Sendrecv blocking
|
|
||||||
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
|
|
||||||
static CommunicatorPolicy_t CommunicatorPolicy;
|
|
||||||
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
|
|
||||||
|
|
||||||
size_t heap_top;
|
size_t heap_top;
|
||||||
size_t heap_bytes;
|
size_t heap_bytes;
|
||||||
@ -225,14 +237,21 @@ class CartesianCommunicator {
|
|||||||
|
|
||||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
||||||
|
|
||||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double StencilSendToRecvFrom(void *xmit,
|
||||||
void *xmit,
|
int xmit_to_rank,
|
||||||
int xmit_to_rank,
|
void *recv,
|
||||||
void *recv,
|
int recv_from_rank,
|
||||||
int recv_from_rank,
|
int bytes,int dir);
|
||||||
int bytes);
|
|
||||||
|
|
||||||
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes,int dir);
|
||||||
|
|
||||||
|
|
||||||
|
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
|
||||||
void StencilBarrier(void);
|
void StencilBarrier(void);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
|
@ -37,11 +37,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <sys/ipc.h>
|
#include <sys/ipc.h>
|
||||||
#include <sys/shm.h>
|
#include <sys/shm.h>
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
//#include <zlib.h>
|
#include <zlib.h>
|
||||||
#ifndef SHM_HUGETLB
|
#ifdef HAVE_NUMAIF_H
|
||||||
#define SHM_HUGETLB 04000
|
#include <numaif.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -197,7 +198,46 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
ShmCommBuf = 0;
|
ShmCommBuf = 0;
|
||||||
ShmCommBufs.resize(ShmSize);
|
ShmCommBufs.resize(ShmSize);
|
||||||
|
|
||||||
#if 1
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Hugetlbf and others map filesystems as mappable huge pages
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#ifdef GRID_MPI3_SHMMMAP
|
||||||
|
char shm_name [NAME_MAX];
|
||||||
|
for(int r=0;r<ShmSize;r++){
|
||||||
|
|
||||||
|
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||||
|
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
|
||||||
|
//sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
|
||||||
|
// printf("Opening file %s \n",shm_name);
|
||||||
|
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
|
||||||
|
if ( fd == -1) {
|
||||||
|
printf("open %s failed\n",shm_name);
|
||||||
|
perror("open hugetlbfs");
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
int mmap_flag = MAP_SHARED ;
|
||||||
|
#ifdef MAP_POPULATE
|
||||||
|
mmap_flag|=MAP_POPULATE;
|
||||||
|
#endif
|
||||||
|
#ifdef MAP_HUGETLB
|
||||||
|
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
|
||||||
|
#endif
|
||||||
|
void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
|
||||||
|
if ( ptr == (void *)MAP_FAILED ) {
|
||||||
|
printf("mmap %s failed\n",shm_name);
|
||||||
|
perror("failed mmap"); assert(0);
|
||||||
|
}
|
||||||
|
assert(((uint64_t)ptr&0x3F)==0);
|
||||||
|
ShmCommBufs[r] =ptr;
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
|
||||||
|
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
|
||||||
|
// the posix shm virtual file system
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#ifdef GRID_MPI3_SHMOPEN
|
||||||
char shm_name [NAME_MAX];
|
char shm_name [NAME_MAX];
|
||||||
if ( ShmRank == 0 ) {
|
if ( ShmRank == 0 ) {
|
||||||
for(int r=0;r<ShmSize;r++){
|
for(int r=0;r<ShmSize;r++){
|
||||||
@ -211,10 +251,38 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
|
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
|
||||||
ftruncate(fd, size);
|
ftruncate(fd, size);
|
||||||
|
|
||||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
int mmap_flag = MAP_SHARED;
|
||||||
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
|
#ifdef MAP_POPULATE
|
||||||
|
mmap_flag |= MAP_POPULATE;
|
||||||
|
#endif
|
||||||
|
#ifdef MAP_HUGETLB
|
||||||
|
if (Hugepages) mmap_flag |= MAP_HUGETLB;
|
||||||
|
#endif
|
||||||
|
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
|
||||||
|
|
||||||
|
if ( ptr == (void * )MAP_FAILED ) { perror("failed mmap"); assert(0); }
|
||||||
assert(((uint64_t)ptr&0x3F)==0);
|
assert(((uint64_t)ptr&0x3F)==0);
|
||||||
ShmCommBufs[r] =ptr;
|
|
||||||
|
// Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h
|
||||||
|
#if 0
|
||||||
|
//#ifdef HAVE_NUMAIF_H
|
||||||
|
int status;
|
||||||
|
int flags=MPOL_MF_MOVE;
|
||||||
|
#ifdef KNL
|
||||||
|
int nodes=1; // numa domain == MCDRAM
|
||||||
|
// Find out if in SNC2,SNC4 mode ?
|
||||||
|
#else
|
||||||
|
int nodes=r; // numa domain == MPI ID
|
||||||
|
#endif
|
||||||
|
unsigned long count=1;
|
||||||
|
for(uint64_t page=0;page<size;page+=4096){
|
||||||
|
void *pages = (void *) ( page + (uint64_t)ptr );
|
||||||
|
uint64_t *cow_it = (uint64_t *)pages; *cow_it = 1;
|
||||||
|
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
|
||||||
|
if (ierr && (page==0)) perror("numa relocate command failed");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
ShmCommBufs[r] =ptr;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -236,21 +304,32 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
ShmCommBufs[r] =ptr;
|
ShmCommBufs[r] =ptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
#else
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// SHMGET SHMAT and SHM_HUGETLB flag
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#ifdef GRID_MPI3_SHMGET
|
||||||
std::vector<int> shmids(ShmSize);
|
std::vector<int> shmids(ShmSize);
|
||||||
|
|
||||||
if ( ShmRank == 0 ) {
|
if ( ShmRank == 0 ) {
|
||||||
for(int r=0;r<ShmSize;r++){
|
for(int r=0;r<ShmSize;r++){
|
||||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||||
key_t key = 0x4545 + r;
|
key_t key = IPC_PRIVATE;
|
||||||
if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
|
int flags = IPC_CREAT | SHM_R | SHM_W;
|
||||||
|
#ifdef SHM_HUGETLB
|
||||||
|
if (Hugepages) flags|=SHM_HUGETLB;
|
||||||
|
#endif
|
||||||
|
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
|
||||||
int errsv = errno;
|
int errsv = errno;
|
||||||
printf("Errno %d\n",errsv);
|
printf("Errno %d\n",errsv);
|
||||||
|
printf("key %d\n",key);
|
||||||
|
printf("size %lld\n",size);
|
||||||
|
printf("flags %d\n",flags);
|
||||||
perror("shmget");
|
perror("shmget");
|
||||||
exit(1);
|
exit(1);
|
||||||
|
} else {
|
||||||
|
printf("shmid: 0x%x\n", shmids[r]);
|
||||||
}
|
}
|
||||||
printf("shmid: 0x%x\n", shmids[r]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
MPI_Barrier(ShmComm);
|
MPI_Barrier(ShmComm);
|
||||||
@ -384,8 +463,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
|||||||
{
|
{
|
||||||
int ierr;
|
int ierr;
|
||||||
communicator=communicator_world;
|
communicator=communicator_world;
|
||||||
|
|
||||||
_ndimension = processors.size();
|
_ndimension = processors.size();
|
||||||
|
|
||||||
|
communicator_halo.resize (2*_ndimension);
|
||||||
|
for(int i=0;i<_ndimension*2;i++){
|
||||||
|
MPI_Comm_dup(communicator,&communicator_halo[i]);
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Assert power of two shm_size.
|
// Assert power of two shm_size.
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
@ -608,13 +693,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||||
void *xmit,
|
int dest,
|
||||||
int dest,
|
void *recv,
|
||||||
void *recv,
|
int from,
|
||||||
int from,
|
int bytes,int dir)
|
||||||
int bytes)
|
|
||||||
{
|
{
|
||||||
|
std::vector<CommsRequest_t> list;
|
||||||
|
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
|
||||||
|
StencilSendToRecvFromComplete(list,dir);
|
||||||
|
return offbytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes,int dir)
|
||||||
|
{
|
||||||
|
assert(dir < communicator_halo.size());
|
||||||
|
|
||||||
MPI_Request xrq;
|
MPI_Request xrq;
|
||||||
MPI_Request rrq;
|
MPI_Request rrq;
|
||||||
|
|
||||||
@ -633,26 +732,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
gfrom = MPI_UNDEFINED;
|
gfrom = MPI_UNDEFINED;
|
||||||
#endif
|
#endif
|
||||||
if ( gfrom ==MPI_UNDEFINED) {
|
if ( gfrom ==MPI_UNDEFINED) {
|
||||||
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.push_back(rrq);
|
list.push_back(rrq);
|
||||||
off_node_bytes+=bytes;
|
off_node_bytes+=bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( gdest == MPI_UNDEFINED ) {
|
if ( gdest == MPI_UNDEFINED ) {
|
||||||
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.push_back(xrq);
|
list.push_back(xrq);
|
||||||
off_node_bytes+=bytes;
|
off_node_bytes+=bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
|
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
|
||||||
this->StencilSendToRecvFromComplete(list);
|
this->StencilSendToRecvFromComplete(list,dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
return off_node_bytes;
|
return off_node_bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
{
|
{
|
||||||
SendToRecvFromComplete(waitall);
|
SendToRecvFromComplete(waitall);
|
||||||
}
|
}
|
||||||
|
286
lib/communicator/Communicator_mpit.cc
Normal file
286
lib/communicator/Communicator_mpit.cc
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/communicator/Communicator_mpi.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/GridCore.h>
|
||||||
|
#include <Grid/GridQCDcore.h>
|
||||||
|
#include <Grid/qcd/action/ActionCore.h>
|
||||||
|
#include <mpi.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Info that is setup once and indept of cartesian layout
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
MPI_Comm CartesianCommunicator::communicator_world;
|
||||||
|
|
||||||
|
// Should error check all MPI calls.
|
||||||
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||||
|
int flag;
|
||||||
|
int provided;
|
||||||
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||||
|
if ( !flag ) {
|
||||||
|
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||||
|
if ( provided != MPI_THREAD_MULTIPLE ) {
|
||||||
|
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
||||||
|
ShmInitGeneric();
|
||||||
|
}
|
||||||
|
|
||||||
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||||
|
{
|
||||||
|
_ndimension = processors.size();
|
||||||
|
std::vector<int> periodic(_ndimension,1);
|
||||||
|
|
||||||
|
_Nprocessors=1;
|
||||||
|
_processors = processors;
|
||||||
|
_processor_coor.resize(_ndimension);
|
||||||
|
|
||||||
|
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
|
||||||
|
MPI_Comm_rank(communicator,&_processor);
|
||||||
|
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
|
||||||
|
|
||||||
|
for(int i=0;i<_ndimension;i++){
|
||||||
|
_Nprocessors*=_processors[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
communicator_halo.resize (2*_ndimension);
|
||||||
|
for(int i=0;i<_ndimension*2;i++){
|
||||||
|
MPI_Comm_dup(communicator,&communicator_halo[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
int Size;
|
||||||
|
MPI_Comm_size(communicator,&Size);
|
||||||
|
|
||||||
|
assert(Size==_Nprocessors);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalXOR(uint32_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalXOR(uint64_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(double &d)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
int rank;
|
||||||
|
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||||
|
assert(ierr==0);
|
||||||
|
return rank;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
coor.resize(_ndimension);
|
||||||
|
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
std::vector<CommsRequest_t> reqs(0);
|
||||||
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||||
|
SendToRecvFromComplete(reqs);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int sender,
|
||||||
|
int receiver,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
MPI_Status stat;
|
||||||
|
assert(sender != receiver);
|
||||||
|
int tag = sender;
|
||||||
|
if ( _processor == sender ) {
|
||||||
|
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||||
|
}
|
||||||
|
if ( _processor == receiver ) {
|
||||||
|
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
int myrank = _processor;
|
||||||
|
int ierr;
|
||||||
|
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||||
|
MPI_Request xrq;
|
||||||
|
MPI_Request rrq;
|
||||||
|
|
||||||
|
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||||
|
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||||
|
|
||||||
|
assert(ierr==0);
|
||||||
|
list.push_back(xrq);
|
||||||
|
list.push_back(rrq);
|
||||||
|
} else {
|
||||||
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
|
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||||
|
recv,bytes,MPI_CHAR,from, from,
|
||||||
|
communicator,MPI_STATUS_IGNORE);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||||
|
{
|
||||||
|
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||||
|
int nreq=list.size();
|
||||||
|
std::vector<MPI_Status> status(nreq);
|
||||||
|
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::Barrier(void)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Barrier(communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Bcast(data,
|
||||||
|
bytes,
|
||||||
|
MPI_BYTE,
|
||||||
|
root,
|
||||||
|
communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
// Should only be used prior to Grid Init finished.
|
||||||
|
// Check for this?
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
int CartesianCommunicator::RankWorld(void){
|
||||||
|
int r;
|
||||||
|
MPI_Comm_rank(communicator_world,&r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
int ierr= MPI_Bcast(data,
|
||||||
|
bytes,
|
||||||
|
MPI_BYTE,
|
||||||
|
root,
|
||||||
|
communicator_world);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes,int dir)
|
||||||
|
{
|
||||||
|
int myrank = _processor;
|
||||||
|
int ierr;
|
||||||
|
assert(dir < communicator_halo.size());
|
||||||
|
|
||||||
|
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
|
||||||
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
|
MPI_Request req[2];
|
||||||
|
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
|
||||||
|
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
|
||||||
|
|
||||||
|
list.push_back(req[0]);
|
||||||
|
list.push_back(req[1]);
|
||||||
|
return 2.0*bytes;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
|
{
|
||||||
|
int nreq=waitall.size();
|
||||||
|
MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
|
||||||
|
};
|
||||||
|
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes,int dir)
|
||||||
|
{
|
||||||
|
int myrank = _processor;
|
||||||
|
int ierr;
|
||||||
|
assert(dir < communicator_halo.size());
|
||||||
|
|
||||||
|
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
|
||||||
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
|
MPI_Request req[2];
|
||||||
|
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
|
||||||
|
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
|
||||||
|
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
|
||||||
|
return 2.0*bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/cshift/Cshift_mpi.h>
|
#include <Grid/cshift/Cshift_mpi.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GRID_COMMS_MPI3L
|
#ifdef GRID_COMMS_MPIT
|
||||||
#include <Grid/cshift/Cshift_mpi.h>
|
#include <Grid/cshift/Cshift_mpi.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
16252
lib/json/json.hpp
16252
lib/json/json.hpp
File diff suppressed because it is too large
Load Diff
@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
||||||
{
|
{
|
||||||
int NN = BlockSolverGrid->_ndimension;
|
int NN = BlockSolverGrid->_ndimension;
|
||||||
@ -387,6 +388,7 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
|
|||||||
}
|
}
|
||||||
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
||||||
@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
|||||||
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
||||||
|
|
||||||
GridBase *FullGrid = X._grid;
|
GridBase *FullGrid = X._grid;
|
||||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
|
|
||||||
Lattice<vobj> Xslice(SliceGrid);
|
// Lattice<vobj> Xslice(SliceGrid);
|
||||||
Lattice<vobj> Rslice(SliceGrid);
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
int nh = FullGrid->_ndimension;
|
int nh = FullGrid->_ndimension;
|
||||||
int nl = SliceGrid->_ndimension;
|
// int nl = SliceGrid->_ndimension;
|
||||||
|
int nl = nh-1;
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
//FIXME package in a convenient iterator
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
|||||||
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
||||||
|
|
||||||
GridBase *FullGrid = X._grid;
|
GridBase *FullGrid = X._grid;
|
||||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
|
// Lattice<vobj> Xslice(SliceGrid);
|
||||||
Lattice<vobj> Xslice(SliceGrid);
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
Lattice<vobj> Rslice(SliceGrid);
|
|
||||||
|
|
||||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
int nh = FullGrid->_ndimension;
|
int nh = FullGrid->_ndimension;
|
||||||
int nl = SliceGrid->_ndimension;
|
// int nl = SliceGrid->_ndimension;
|
||||||
|
int nl=1;
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
//FIXME package in a convenient iterator
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
@ -498,18 +501,19 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
|||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
GridBase *FullGrid = lhs._grid;
|
GridBase *FullGrid = lhs._grid;
|
||||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
|
|
||||||
int Nblock = FullGrid->GlobalDimensions()[Orthog];
|
int Nblock = FullGrid->GlobalDimensions()[Orthog];
|
||||||
|
|
||||||
Lattice<vobj> Lslice(SliceGrid);
|
// Lattice<vobj> Lslice(SliceGrid);
|
||||||
Lattice<vobj> Rslice(SliceGrid);
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
|
||||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
int nh = FullGrid->_ndimension;
|
int nh = FullGrid->_ndimension;
|
||||||
int nl = SliceGrid->_ndimension;
|
// int nl = SliceGrid->_ndimension;
|
||||||
|
int nl = nh-1;
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
//FIXME package in a convenient iterator
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
@ -549,6 +553,14 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
|||||||
mat += mat_thread;
|
mat += mat_thread;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
for(int j=0;j<Nblock;j++){
|
||||||
|
ComplexD sum = mat(i,j);
|
||||||
|
FullGrid->GlobalSum(sum);
|
||||||
|
mat(i,j)=sum;
|
||||||
|
}}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
|
|||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
void Grid_quiesce_nodes(void) {
|
void Grid_quiesce_nodes(void) {
|
||||||
int me = 0;
|
int me = 0;
|
||||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
|
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_COMMS_SHMEM
|
#ifdef GRID_COMMS_SHMEM
|
||||||
|
@ -29,7 +29,7 @@
|
|||||||
#ifndef GRID_BINARY_IO_H
|
#ifndef GRID_BINARY_IO_H
|
||||||
#define GRID_BINARY_IO_H
|
#define GRID_BINARY_IO_H
|
||||||
|
|
||||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
|
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
|
||||||
#define USE_MPI_IO
|
#define USE_MPI_IO
|
||||||
#else
|
#else
|
||||||
#undef USE_MPI_IO
|
#undef USE_MPI_IO
|
||||||
@ -99,34 +99,38 @@ class BinaryIO {
|
|||||||
NerscChecksum(grid,scalardata,nersc_csum);
|
NerscChecksum(grid,scalardata,nersc_csum);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
|
template <class fobj>
|
||||||
|
static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
|
||||||
{
|
{
|
||||||
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
|
const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
|
||||||
|
|
||||||
|
uint64_t lsites = grid->lSites();
|
||||||
uint64_t lsites =grid->lSites();
|
if (fbuf.size() == 1)
|
||||||
if (fbuf.size()==1) {
|
{
|
||||||
lsites=1;
|
lsites = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
uint32_t nersc_csum_thr=0;
|
uint32_t nersc_csum_thr = 0;
|
||||||
|
|
||||||
#pragma omp for
|
#pragma omp for
|
||||||
for(uint64_t local_site=0;local_site<lsites;local_site++){
|
for (uint64_t local_site = 0; local_site < lsites; local_site++)
|
||||||
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
|
{
|
||||||
for(uint64_t j=0;j<size32;j++){
|
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
|
||||||
nersc_csum_thr=nersc_csum_thr+site_buf[j];
|
for (uint64_t j = 0; j < size32; j++)
|
||||||
}
|
{
|
||||||
|
nersc_csum_thr = nersc_csum_thr + site_buf[j];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp critical
|
#pragma omp critical
|
||||||
{
|
{
|
||||||
nersc_csum += nersc_csum_thr;
|
nersc_csum += nersc_csum_thr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
|
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
|
||||||
{
|
{
|
||||||
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
|
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
|
||||||
@ -363,17 +367,21 @@ class BinaryIO {
|
|||||||
assert(0);
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
|
std::cout << GridLogMessage << "C++ read I/O " << file << " : "
|
||||||
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
|
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
|
||||||
std::ifstream fin;
|
std::ifstream fin;
|
||||||
fin.open(file,std::ios::binary|std::ios::in);
|
fin.open(file, std::ios::binary | std::ios::in);
|
||||||
if ( control & BINARYIO_MASTER_APPEND ) {
|
if (control & BINARYIO_MASTER_APPEND)
|
||||||
fin.seekg(-sizeof(fobj),fin.end);
|
{
|
||||||
} else {
|
fin.seekg(-sizeof(fobj), fin.end);
|
||||||
fin.seekg(offset+myrank*lsites*sizeof(fobj));
|
}
|
||||||
}
|
else
|
||||||
fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
|
{
|
||||||
fin.close();
|
fin.seekg(offset + myrank * lsites * sizeof(fobj));
|
||||||
|
}
|
||||||
|
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
|
||||||
|
assert(fin.fail() == 0);
|
||||||
|
fin.close();
|
||||||
}
|
}
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
|
|
||||||
@ -405,30 +413,78 @@ class BinaryIO {
|
|||||||
timer.Start();
|
timer.Start();
|
||||||
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
|
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
|
||||||
#ifdef USE_MPI_IO
|
#ifdef USE_MPI_IO
|
||||||
std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
|
std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
|
||||||
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
|
ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
|
||||||
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
|
std::cout << GridLogMessage << "Checking for errors" << std::endl;
|
||||||
ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
|
if (ierr != MPI_SUCCESS)
|
||||||
MPI_File_close(&fh);
|
{
|
||||||
MPI_Type_free(&fileArray);
|
char error_string[BUFSIZ];
|
||||||
MPI_Type_free(&localArray);
|
int length_of_error_string, error_class;
|
||||||
|
|
||||||
|
MPI_Error_class(ierr, &error_class);
|
||||||
|
MPI_Error_string(error_class, error_string, &length_of_error_string);
|
||||||
|
fprintf(stderr, "%3d: %s\n", myrank, error_string);
|
||||||
|
MPI_Error_string(ierr, error_string, &length_of_error_string);
|
||||||
|
fprintf(stderr, "%3d: %s\n", myrank, error_string);
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
|
||||||
|
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
|
||||||
|
assert(ierr == 0);
|
||||||
|
|
||||||
|
std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
|
||||||
|
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
|
||||||
|
assert(ierr == 0);
|
||||||
|
|
||||||
|
MPI_File_close(&fh);
|
||||||
|
MPI_Type_free(&fileArray);
|
||||||
|
MPI_Type_free(&localArray);
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
|
|
||||||
std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
|
std::ofstream fout;
|
||||||
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
|
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
|
||||||
if ( control & BINARYIO_MASTER_APPEND ) {
|
try {
|
||||||
|
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
|
||||||
|
} catch (const std::fstream::failure& exc) {
|
||||||
|
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
|
||||||
|
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
|
||||||
|
std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
|
||||||
|
#ifdef USE_MPI_IO
|
||||||
|
MPI_Abort(MPI_COMM_WORLD,1);
|
||||||
|
#else
|
||||||
|
exit(1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
|
||||||
|
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
|
||||||
|
|
||||||
|
if ( control & BINARYIO_MASTER_APPEND ) {
|
||||||
fout.seekp(0,fout.end);
|
fout.seekp(0,fout.end);
|
||||||
} else {
|
} else {
|
||||||
fout.seekp(offset+myrank*lsites*sizeof(fobj));
|
fout.seekp(offset+myrank*lsites*sizeof(fobj));
|
||||||
}
|
}
|
||||||
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
|
|
||||||
|
try {
|
||||||
|
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
|
||||||
|
}
|
||||||
|
catch (const std::fstream::failure& exc) {
|
||||||
|
std::cout << "Exception in writing file " << file << std::endl;
|
||||||
|
std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
|
||||||
|
#ifdef USE_MPI_IO
|
||||||
|
MPI_Abort(MPI_COMM_WORLD,1);
|
||||||
|
#else
|
||||||
|
exit(1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
fout.close();
|
fout.close();
|
||||||
}
|
}
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<"IOobject: ";
|
std::cout<<GridLogMessage<<"IOobject: ";
|
||||||
if ( control & BINARYIO_READ) std::cout << " read ";
|
if ( control & BINARYIO_READ) std::cout << " read ";
|
||||||
@ -442,11 +498,14 @@ class BinaryIO {
|
|||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
// Safety check
|
// Safety check
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
grid->Barrier();
|
// if the data size is 1 we do not want to sum over the MPI ranks
|
||||||
grid->GlobalSum(nersc_csum);
|
if (iodata.size() != 1){
|
||||||
grid->GlobalXOR(scidac_csuma);
|
grid->Barrier();
|
||||||
grid->GlobalXOR(scidac_csumb);
|
grid->GlobalSum(nersc_csum);
|
||||||
grid->Barrier();
|
grid->GlobalXOR(scidac_csuma);
|
||||||
|
grid->GlobalXOR(scidac_csumb);
|
||||||
|
grid->Barrier();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
@ -546,9 +605,9 @@ class BinaryIO {
|
|||||||
int gsites = grid->gSites();
|
int gsites = grid->gSites();
|
||||||
int lsites = grid->lSites();
|
int lsites = grid->lSites();
|
||||||
|
|
||||||
uint32_t nersc_csum_tmp;
|
uint32_t nersc_csum_tmp = 0;
|
||||||
uint32_t scidac_csuma_tmp;
|
uint32_t scidac_csuma_tmp = 0;
|
||||||
uint32_t scidac_csumb_tmp;
|
uint32_t scidac_csumb_tmp = 0;
|
||||||
|
|
||||||
GridStopWatch timer;
|
GridStopWatch timer;
|
||||||
|
|
||||||
|
@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
|
|||||||
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
|
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
|
||||||
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
|
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
|
||||||
// 4
|
// 4
|
||||||
#ifdef AVX512
|
#ifdef KNL
|
||||||
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
|
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
|
||||||
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
|
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
|
||||||
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
|
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
|
||||||
|
100
lib/qcd/action/fermion/AbstractEOFAFermion.h
Normal file
100
lib/qcd/action/fermion/AbstractEOFAFermion.h
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/AbstractEOFAFermion.h
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef GRID_QCD_ABSTRACT_EOFA_FERMION_H
|
||||||
|
#define GRID_QCD_ABSTRACT_EOFA_FERMION_H
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
// DJM: Abstract base class for EOFA fermion types.
|
||||||
|
// Defines layout of additional EOFA-specific parameters and operators.
|
||||||
|
// Use to construct EOFA pseudofermion actions that are agnostic to
|
||||||
|
// Shamir / Mobius / etc., and ensure that no one can construct EOFA
|
||||||
|
// pseudofermion action with non-EOFA fermion type.
|
||||||
|
template<class Impl>
|
||||||
|
class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
|
||||||
|
public:
|
||||||
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
|
|
||||||
|
public:
|
||||||
|
// Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
|
||||||
|
RealD mq1;
|
||||||
|
RealD mq2;
|
||||||
|
RealD mq3;
|
||||||
|
RealD shift;
|
||||||
|
int pm;
|
||||||
|
|
||||||
|
RealD alpha; // Mobius scale
|
||||||
|
RealD k; // EOFA normalization constant
|
||||||
|
|
||||||
|
virtual void Instantiatable(void) = 0;
|
||||||
|
|
||||||
|
// EOFA-specific operations
|
||||||
|
// Force user to implement in derived classes
|
||||||
|
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag) = 0;
|
||||||
|
virtual void Dtilde (const FermionField& in, FermionField& out) = 0;
|
||||||
|
virtual void DtildeInv(const FermionField& in, FermionField& out) = 0;
|
||||||
|
|
||||||
|
// Implement derivatives in base class:
|
||||||
|
// for EOFA both DWF and Mobius just need d(Dw)/dU
|
||||||
|
virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
|
||||||
|
this->DhopDeriv(mat, U, V, dag);
|
||||||
|
};
|
||||||
|
virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
|
||||||
|
this->DhopDerivOE(mat, U, V, dag);
|
||||||
|
};
|
||||||
|
virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
|
||||||
|
this->DhopDerivEO(mat, U, V, dag);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Recompute 5D coefficients for different value of shift constant
|
||||||
|
// (needed for heatbath loop over poles)
|
||||||
|
virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
|
||||||
|
|
||||||
|
// Constructors
|
||||||
|
AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
|
||||||
|
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
|
||||||
|
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
|
||||||
|
RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
|
||||||
|
: CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
|
||||||
|
_mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
this->alpha = _b + _c;
|
||||||
|
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
|
||||||
|
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
|
||||||
|
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}}
|
||||||
|
|
||||||
|
#endif
|
@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
|||||||
for(int i=0; i < Ls; i++){
|
for(int i=0; i < Ls; i++){
|
||||||
as[i] = 1.0;
|
as[i] = 1.0;
|
||||||
omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
|
omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
|
||||||
// assert(fabs(omega[i])>0.0);
|
assert(omega[i]!=Coeff_t(0.0));
|
||||||
bs[i] = 0.5*(bpc/omega[i] + bmc);
|
bs[i] = 0.5*(bpc/omega[i] + bmc);
|
||||||
cs[i] = 0.5*(bpc/omega[i] - bmc);
|
cs[i] = 0.5*(bpc/omega[i] - bmc);
|
||||||
}
|
}
|
||||||
@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
|||||||
|
|
||||||
for(int i=0;i<Ls;i++){
|
for(int i=0;i<Ls;i++){
|
||||||
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
|
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
|
||||||
// assert(fabs(bee[i])>0.0);
|
assert(bee[i]!=Coeff_t(0.0));
|
||||||
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
|
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
|
||||||
beo[i]=as[i]*bs[i];
|
beo[i]=as[i]*bs[i];
|
||||||
ceo[i]=-as[i]*cs[i];
|
ceo[i]=-as[i]*cs[i];
|
||||||
@ -456,10 +456,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
|||||||
|
|
||||||
if ( i < Ls-1 ) {
|
if ( i < Ls-1 ) {
|
||||||
|
|
||||||
|
assert(bee[i]!=Coeff_t(0.0));
|
||||||
|
assert(bee[0]!=Coeff_t(0.0));
|
||||||
|
|
||||||
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
|
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
|
||||||
|
|
||||||
leem[i]=mass*cee[Ls-1]/bee[0];
|
leem[i]=mass*cee[Ls-1]/bee[0];
|
||||||
for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1];
|
for(int j=0;j<i;j++) {
|
||||||
|
assert(bee[j+1]!=Coeff_t(0.0));
|
||||||
|
leem[i]*= aee[j]/bee[j+1];
|
||||||
|
}
|
||||||
|
|
||||||
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
|
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
|
||||||
|
|
||||||
@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
|||||||
{
|
{
|
||||||
Coeff_t delta_d=mass*cee[Ls-1];
|
Coeff_t delta_d=mass*cee[Ls-1];
|
||||||
for(int j=0;j<Ls-1;j++) {
|
for(int j=0;j<Ls-1;j++) {
|
||||||
// assert(fabs(bee[j])>0.0);
|
assert(bee[j] != Coeff_t(0.0));
|
||||||
delta_d *= cee[j]/bee[j];
|
delta_d *= cee[j]/bee[j];
|
||||||
}
|
}
|
||||||
dee[Ls-1] += delta_d;
|
dee[Ls-1] += delta_d;
|
||||||
|
@ -179,9 +179,9 @@ namespace Grid {
|
|||||||
double MooeeInvTime;
|
double MooeeInvTime;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
|
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
438
lib/qcd/action/fermion/DomainWallEOFAFermion.cc
Normal file
438
lib/qcd/action/fermion/DomainWallEOFAFermion.cc
Normal file
@ -0,0 +1,438 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/Grid_Eigen_Dense.h>
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
|
||||||
|
GaugeField &_Umu,
|
||||||
|
GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
RealD _mq1, RealD _mq2, RealD _mq3,
|
||||||
|
RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
|
||||||
|
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
|
||||||
|
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
|
||||||
|
_shift, _pm, _M5, 1.0, 0.0, p)
|
||||||
|
{
|
||||||
|
RealD eps = 1.0;
|
||||||
|
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
|
||||||
|
assert(zdata->n == this->Ls);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
|
||||||
|
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
|
||||||
|
|
||||||
|
Approx::zolotarev_free(zdata);
|
||||||
|
}
|
||||||
|
|
||||||
|
/***************************************************************
|
||||||
|
/* Additional EOFA operators only called outside the inverter.
|
||||||
|
/* Since speed is not essential, simple axpby-style
|
||||||
|
/* implementations should be fine.
|
||||||
|
/***************************************************************/
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
Din = zero;
|
||||||
|
if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
|
||||||
|
else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
|
||||||
|
else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
|
||||||
|
else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is just the identity for DWF
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
|
||||||
|
|
||||||
|
// This is just the identity for DWF
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
|
||||||
|
|
||||||
|
/*****************************************************************************************************/
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
FermionField Din(psi._grid);
|
||||||
|
|
||||||
|
this->Meooe5D(psi, Din);
|
||||||
|
this->DW(Din, chi, DaggerNo);
|
||||||
|
axpby(chi, 1.0, 1.0, chi, psi);
|
||||||
|
this->M5D(psi, chi);
|
||||||
|
return(norm2(chi));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
FermionField Din(psi._grid);
|
||||||
|
|
||||||
|
this->DW(psi, Din, DaggerYes);
|
||||||
|
this->MeooeDag5D(Din, chi);
|
||||||
|
this->M5Ddag(psi, chi);
|
||||||
|
axpby(chi, 1.0, 1.0, chi, psi);
|
||||||
|
return(norm2(chi));
|
||||||
|
}
|
||||||
|
|
||||||
|
/********************************************************************
 * Performance critical fermion operators called inside the inverter
 ********************************************************************/
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int pm = this->pm;
|
||||||
|
RealD shift = this->shift;
|
||||||
|
RealD mq1 = this->mq1;
|
||||||
|
RealD mq2 = this->mq2;
|
||||||
|
RealD mq3 = this->mq3;
|
||||||
|
|
||||||
|
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
|
||||||
|
Coeff_t shiftp(0.0), shiftm(0.0);
|
||||||
|
if(shift != 0.0){
|
||||||
|
if(pm == 1){ shiftp = shift*(mq3-mq2); }
|
||||||
|
else{ shiftm = -shift*(mq3-mq2); }
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Coeff_t> diag(Ls,1.0);
|
||||||
|
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
|
||||||
|
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
|
||||||
|
|
||||||
|
#if(0)
|
||||||
|
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
|
||||||
|
for(int i=0; i<diag.size(); ++i){
|
||||||
|
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
|
||||||
|
}
|
||||||
|
for(int i=0; i<upper.size(); ++i){
|
||||||
|
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
|
||||||
|
}
|
||||||
|
for(int i=0; i<lower.size(); ++i){
|
||||||
|
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
this->M5D(psi, chi, chi, lower, diag, upper);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int pm = this->pm;
|
||||||
|
RealD shift = this->shift;
|
||||||
|
RealD mq1 = this->mq1;
|
||||||
|
RealD mq2 = this->mq2;
|
||||||
|
RealD mq3 = this->mq3;
|
||||||
|
|
||||||
|
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
|
||||||
|
Coeff_t shiftp(0.0), shiftm(0.0);
|
||||||
|
if(shift != 0.0){
|
||||||
|
if(pm == 1){ shiftp = shift*(mq3-mq2); }
|
||||||
|
else{ shiftm = -shift*(mq3-mq2); }
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Coeff_t> diag(Ls,1.0);
|
||||||
|
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
|
||||||
|
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
|
||||||
|
|
||||||
|
#if(0)
|
||||||
|
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
|
||||||
|
for(int i=0; i<diag.size(); ++i){
|
||||||
|
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
|
||||||
|
}
|
||||||
|
for(int i=0; i<upper.size(); ++i){
|
||||||
|
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
|
||||||
|
}
|
||||||
|
for(int i=0; i<lower.size(); ++i){
|
||||||
|
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
this->M5Ddag(psi, chi, chi, lower, diag, upper);
|
||||||
|
}
|
||||||
|
|
||||||
|
// half checkerboard operations
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
std::vector<Coeff_t> diag = this->bee;
|
||||||
|
std::vector<Coeff_t> upper(Ls);
|
||||||
|
std::vector<Coeff_t> lower(Ls);
|
||||||
|
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
upper[s] = -this->cee[s];
|
||||||
|
lower[s] = -this->cee[s];
|
||||||
|
}
|
||||||
|
upper[Ls-1] = this->dm;
|
||||||
|
lower[0] = this->dp;
|
||||||
|
|
||||||
|
this->M5D(psi, psi, chi, lower, diag, upper);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
std::vector<Coeff_t> diag = this->bee;
|
||||||
|
std::vector<Coeff_t> upper(Ls);
|
||||||
|
std::vector<Coeff_t> lower(Ls);
|
||||||
|
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
upper[s] = -this->cee[s];
|
||||||
|
lower[s] = -this->cee[s];
|
||||||
|
}
|
||||||
|
upper[Ls-1] = this->dp;
|
||||||
|
lower[0] = this->dm;
|
||||||
|
|
||||||
|
this->M5Ddag(psi, psi, chi, lower, diag, upper);
|
||||||
|
}
|
||||||
|
|
||||||
|
/****************************************************************************************/
|
||||||
|
|
||||||
|
//Zolo
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int pm = this->pm;
|
||||||
|
RealD mq1 = this->mq1;
|
||||||
|
RealD mq2 = this->mq2;
|
||||||
|
RealD mq3 = this->mq3;
|
||||||
|
RealD shift = this->shift;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
// Constants for the preconditioned matrix Cayley form
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
this->bs.resize(Ls);
|
||||||
|
this->cs.resize(Ls);
|
||||||
|
this->aee.resize(Ls);
|
||||||
|
this->aeo.resize(Ls);
|
||||||
|
this->bee.resize(Ls);
|
||||||
|
this->beo.resize(Ls);
|
||||||
|
this->cee.resize(Ls);
|
||||||
|
this->ceo.resize(Ls);
|
||||||
|
|
||||||
|
for(int i=0; i<Ls; ++i){
|
||||||
|
this->bee[i] = 4.0 - this->M5 + 1.0;
|
||||||
|
this->cee[i] = 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0; i<Ls; ++i){
|
||||||
|
this->aee[i] = this->cee[i];
|
||||||
|
this->bs[i] = this->beo[i] = 1.0;
|
||||||
|
this->cs[i] = this->ceo[i] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// EOFA shift terms
|
||||||
|
//////////////////////////////////////////
|
||||||
|
if(pm == 1){
|
||||||
|
this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
|
||||||
|
this->dm = mq1*this->cee[Ls-1];
|
||||||
|
} else if(this->pm == -1) {
|
||||||
|
this->dp = mq1*this->cee[0];
|
||||||
|
this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
|
||||||
|
} else {
|
||||||
|
this->dp = mq1*this->cee[0];
|
||||||
|
this->dm = mq1*this->cee[Ls-1];
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// LDU decomposition of eeoo
|
||||||
|
//////////////////////////////////////////
|
||||||
|
this->dee.resize(Ls+1);
|
||||||
|
this->lee.resize(Ls);
|
||||||
|
this->leem.resize(Ls);
|
||||||
|
this->uee.resize(Ls);
|
||||||
|
this->ueem.resize(Ls);
|
||||||
|
|
||||||
|
for(int i=0; i<Ls; ++i){
|
||||||
|
|
||||||
|
if(i < Ls-1){
|
||||||
|
|
||||||
|
this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
|
||||||
|
|
||||||
|
this->leem[i] = this->dm/this->bee[i];
|
||||||
|
for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
|
||||||
|
|
||||||
|
this->dee[i] = this->bee[i];
|
||||||
|
|
||||||
|
this->uee[i] = -this->aee[i]/this->bee[i]; // up-diag entry on the ith row
|
||||||
|
|
||||||
|
this->ueem[i] = this->dp / this->bee[0];
|
||||||
|
for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
this->lee[i] = 0.0;
|
||||||
|
this->leem[i] = 0.0;
|
||||||
|
this->uee[i] = 0.0;
|
||||||
|
this->ueem[i] = 0.0;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
Coeff_t delta_d = 1.0 / this->bee[0];
|
||||||
|
for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
|
||||||
|
this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
|
||||||
|
this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
|
||||||
|
}
|
||||||
|
|
||||||
|
int inv = 1;
|
||||||
|
this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
|
||||||
|
this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recompute Cayley-form coefficients for different shift
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
|
||||||
|
{
|
||||||
|
this->shift = new_shift;
|
||||||
|
Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
|
||||||
|
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
|
||||||
|
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
GridBase* grid = this->FermionRedBlackGrid();
|
||||||
|
int LLs = grid->_rdimensions[0];
|
||||||
|
|
||||||
|
if(LLs == Ls){ return; } // Not vectorised in 5th direction
|
||||||
|
|
||||||
|
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||||
|
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||||
|
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
Pplus(s,s) = this->bee[s];
|
||||||
|
Pminus(s,s) = this->bee[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
Pminus(s,s+1) = -this->cee[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
Pplus(s+1,s) = -this->cee[s+1];
|
||||||
|
}
|
||||||
|
|
||||||
|
Pplus (0,Ls-1) = this->dp;
|
||||||
|
Pminus(Ls-1,0) = this->dm;
|
||||||
|
|
||||||
|
Eigen::MatrixXcd PplusMat ;
|
||||||
|
Eigen::MatrixXcd PminusMat;
|
||||||
|
|
||||||
|
#if(0)
|
||||||
|
std::cout << GridLogMessage << "Pplus:" << std::endl;
|
||||||
|
for(int s=0; s<Ls; ++s){
|
||||||
|
for(int ss=0; ss<Ls; ++ss){
|
||||||
|
std::cout << Pplus(s,ss) << "\t";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage << "Pminus:" << std::endl;
|
||||||
|
for(int s=0; s<Ls; ++s){
|
||||||
|
for(int ss=0; ss<Ls; ++ss){
|
||||||
|
std::cout << Pminus(s,ss) << "\t";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if(inv) {
|
||||||
|
PplusMat = Pplus.inverse();
|
||||||
|
PminusMat = Pminus.inverse();
|
||||||
|
} else {
|
||||||
|
PplusMat = Pplus;
|
||||||
|
PminusMat = Pminus;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(dag){
|
||||||
|
PplusMat.adjointInPlace();
|
||||||
|
PminusMat.adjointInPlace();
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef typename SiteHalfSpinor::scalar_type scalar_type;
|
||||||
|
const int Nsimd = Simd::Nsimd();
|
||||||
|
Matp.resize(Ls*LLs);
|
||||||
|
Matm.resize(Ls*LLs);
|
||||||
|
|
||||||
|
for(int s2=0; s2<Ls; s2++){
|
||||||
|
for(int s1=0; s1<LLs; s1++){
|
||||||
|
int istride = LLs;
|
||||||
|
int ostride = 1;
|
||||||
|
Simd Vp;
|
||||||
|
Simd Vm;
|
||||||
|
scalar_type *sp = (scalar_type*) &Vp;
|
||||||
|
scalar_type *sm = (scalar_type*) &Vm;
|
||||||
|
for(int l=0; l<Nsimd; l++){
|
||||||
|
if(switcheroo<Coeff_t>::iscomplex()) {
|
||||||
|
sp[l] = PplusMat (l*istride+s1*ostride,s2);
|
||||||
|
sm[l] = PminusMat(l*istride+s1*ostride,s2);
|
||||||
|
} else {
|
||||||
|
// if real
|
||||||
|
scalar_type tmp;
|
||||||
|
tmp = PplusMat (l*istride+s1*ostride,s2);
|
||||||
|
sp[l] = scalar_type(tmp.real(),tmp.real());
|
||||||
|
tmp = PminusMat(l*istride+s1*ostride,s2);
|
||||||
|
sm[l] = scalar_type(tmp.real(),tmp.real());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Matp[LLs*s2+s1] = Vp;
|
||||||
|
Matm[LLs*s2+s1] = Vm;
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
|
FermOpTemplateInstantiate(DomainWallEOFAFermion);
|
||||||
|
GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
|
||||||
|
|
||||||
|
}}
|
115
lib/qcd/action/fermion/DomainWallEOFAFermion.h
Normal file
115
lib/qcd/action/fermion/DomainWallEOFAFermion.h
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.h
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
|
||||||
|
#define GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
|
|
||||||
|
public:
|
||||||
|
// Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
|
||||||
|
// for red-black preconditioned Shamir EOFA
|
||||||
|
Coeff_t dm;
|
||||||
|
Coeff_t dp;
|
||||||
|
|
||||||
|
virtual void Instantiatable(void) {};
|
||||||
|
|
||||||
|
// EOFA-specific operations
|
||||||
|
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag);
|
||||||
|
virtual void Dtilde (const FermionField& in, FermionField& out);
|
||||||
|
virtual void DtildeInv (const FermionField& in, FermionField& out);
|
||||||
|
|
||||||
|
// override multiply
|
||||||
|
virtual RealD M (const FermionField& in, FermionField& out);
|
||||||
|
virtual RealD Mdag (const FermionField& in, FermionField& out);
|
||||||
|
|
||||||
|
// half checkerboard operations
|
||||||
|
virtual void Mooee (const FermionField& in, FermionField& out);
|
||||||
|
virtual void MooeeDag (const FermionField& in, FermionField& out);
|
||||||
|
virtual void MooeeInv (const FermionField& in, FermionField& out);
|
||||||
|
virtual void MooeeInvDag(const FermionField& in, FermionField& out);
|
||||||
|
|
||||||
|
virtual void M5D (const FermionField& psi, FermionField& chi);
|
||||||
|
virtual void M5Ddag (const FermionField& psi, FermionField& chi);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
// Instantiate different versions depending on Impl
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||||
|
|
||||||
|
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||||
|
|
||||||
|
void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
|
||||||
|
|
||||||
|
void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
|
||||||
|
|
||||||
|
void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
|
||||||
|
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
|
||||||
|
|
||||||
|
void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
|
||||||
|
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
|
||||||
|
|
||||||
|
virtual void RefreshShiftCoefficients(RealD new_shift);
|
||||||
|
|
||||||
|
// Constructors
|
||||||
|
DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
|
||||||
|
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
|
||||||
|
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
|
||||||
|
RealD _M5, const ImplParams& p=ImplParams());
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
|
||||||
|
};
|
||||||
|
}}
|
||||||
|
|
||||||
|
#define INSTANTIATE_DPERP_DWF_EOFA(A)\
|
||||||
|
template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
|
||||||
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
|
||||||
|
template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
|
||||||
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
|
||||||
|
template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
|
||||||
|
template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
|
||||||
|
|
||||||
|
#undef DOMAIN_WALL_EOFA_DPERP_DENSE
|
||||||
|
#define DOMAIN_WALL_EOFA_DPERP_CACHE
|
||||||
|
#undef DOMAIN_WALL_EOFA_DPERP_LINALG
|
||||||
|
#define DOMAIN_WALL_EOFA_DPERP_VEC
|
||||||
|
|
||||||
|
#endif
|
248
lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
Normal file
248
lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
Normal file
@ -0,0 +1,248 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
|
||||||
|
|
||||||
|
// Pminus fowards
|
||||||
|
// Pplus backwards..
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
GridBase* grid = psi._grid;
|
||||||
|
|
||||||
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
this->M5Dcalls++;
|
||||||
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
auto tmp = psi._odata[0];
|
||||||
|
if(s==0) {
|
||||||
|
spProj5m(tmp, psi._odata[ss+s+1]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5p(tmp, psi._odata[ss+Ls-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
} else if(s==(Ls-1)) {
|
||||||
|
spProj5m(tmp, psi._odata[ss+0]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5p(tmp, psi._odata[ss+s-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
} else {
|
||||||
|
spProj5m(tmp, psi._odata[ss+s+1]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5p(tmp, psi._odata[ss+s-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->M5Dtime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
GridBase* grid = psi._grid;
|
||||||
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
|
chi.checkerboard=psi.checkerboard;
|
||||||
|
|
||||||
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
this->M5Dcalls++;
|
||||||
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
|
||||||
|
auto tmp = psi._odata[0];
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
if(s==0) {
|
||||||
|
spProj5p(tmp, psi._odata[ss+s+1]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5m(tmp, psi._odata[ss+Ls-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
} else if(s==(Ls-1)) {
|
||||||
|
spProj5p(tmp, psi._odata[ss+0]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5m(tmp, psi._odata[ss+s-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
} else {
|
||||||
|
spProj5p(tmp, psi._odata[ss+s+1]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5m(tmp, psi._odata[ss+s-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->M5Dtime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
GridBase* grid = psi._grid;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
this->MooeeInvCalls++;
|
||||||
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
|
||||||
|
|
||||||
|
auto tmp1 = psi._odata[0];
|
||||||
|
auto tmp2 = psi._odata[0];
|
||||||
|
|
||||||
|
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
|
||||||
|
// Apply (L^{\prime})^{-1}
|
||||||
|
chi[ss] = psi[ss]; // chi[0]=psi[0]
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
spProj5p(tmp1, chi[ss+s-1]);
|
||||||
|
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-1}
|
||||||
|
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
|
||||||
|
spProj5m(tmp1, chi[ss+s]);
|
||||||
|
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// U_m^{-1} D^{-1}
|
||||||
|
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
|
||||||
|
spProj5p(tmp1, chi[ss+Ls-1]);
|
||||||
|
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
|
||||||
|
}
|
||||||
|
spProj5m(tmp2, chi[ss+Ls-1]);
|
||||||
|
chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
|
||||||
|
|
||||||
|
// Apply U^{-1}
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
spProj5m(tmp1, chi[ss+s+1]);
|
||||||
|
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->MooeeInvTime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
GridBase* grid = psi._grid;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
assert(psi.checkerboard == psi.checkerboard);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
std::vector<Coeff_t> ueec(Ls);
|
||||||
|
std::vector<Coeff_t> deec(Ls+1);
|
||||||
|
std::vector<Coeff_t> leec(Ls);
|
||||||
|
std::vector<Coeff_t> ueemc(Ls);
|
||||||
|
std::vector<Coeff_t> leemc(Ls);
|
||||||
|
|
||||||
|
for(int s=0; s<ueec.size(); s++){
|
||||||
|
ueec[s] = conjugate(this->uee[s]);
|
||||||
|
deec[s] = conjugate(this->dee[s]);
|
||||||
|
leec[s] = conjugate(this->lee[s]);
|
||||||
|
ueemc[s] = conjugate(this->ueem[s]);
|
||||||
|
leemc[s] = conjugate(this->leem[s]);
|
||||||
|
}
|
||||||
|
deec[Ls] = conjugate(this->dee[Ls]);
|
||||||
|
|
||||||
|
this->MooeeInvCalls++;
|
||||||
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
|
||||||
|
|
||||||
|
auto tmp1 = psi._odata[0];
|
||||||
|
auto tmp2 = psi._odata[0];
|
||||||
|
|
||||||
|
// Apply (U^{\prime})^{-dagger}
|
||||||
|
chi[ss] = psi[ss];
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
spProj5m(tmp1, chi[ss+s-1]);
|
||||||
|
chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// U_m^{-\dagger}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
spProj5p(tmp1, chi[ss+s]);
|
||||||
|
chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-\dagger} D^{-dagger}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
spProj5m(tmp1, chi[ss+Ls-1]);
|
||||||
|
chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
|
||||||
|
}
|
||||||
|
spProj5p(tmp2, chi[ss+Ls-1]);
|
||||||
|
chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
|
||||||
|
|
||||||
|
// Apply L^{-dagger}
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
spProj5p(tmp1, chi[ss+s+1]);
|
||||||
|
chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->MooeeInvTime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}}
|
159
lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
Normal file
159
lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
Normal file
@ -0,0 +1,159 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/Grid_Eigen_Dense.h>
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Dense matrix versions of routines
|
||||||
|
*/
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int LLs = psi._grid->_rdimensions[0];
|
||||||
|
int vol = psi._grid->oSites()/LLs;
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
assert(Ls==LLs);
|
||||||
|
|
||||||
|
Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls);
|
||||||
|
Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
|
||||||
|
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
Pplus(s,s) = this->bee[s];
|
||||||
|
Pminus(s,s) = this->bee[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
Pminus(s,s+1) = -this->cee[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
Pplus(s+1,s) = -this->cee[s+1];
|
||||||
|
}
|
||||||
|
|
||||||
|
Pplus (0,Ls-1) = this->dp;
|
||||||
|
Pminus(Ls-1,0) = this->dm;
|
||||||
|
|
||||||
|
Eigen::MatrixXd PplusMat ;
|
||||||
|
Eigen::MatrixXd PminusMat;
|
||||||
|
|
||||||
|
if(inv) {
|
||||||
|
PplusMat = Pplus.inverse();
|
||||||
|
PminusMat = Pminus.inverse();
|
||||||
|
} else {
|
||||||
|
PplusMat = Pplus;
|
||||||
|
PminusMat = Pminus;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(dag){
|
||||||
|
PplusMat.adjointInPlace();
|
||||||
|
PminusMat.adjointInPlace();
|
||||||
|
}
|
||||||
|
|
||||||
|
// For the non-vectorised s-direction this is simple
|
||||||
|
|
||||||
|
for(auto site=0; site<vol; site++){
|
||||||
|
|
||||||
|
SiteSpinor SiteChi;
|
||||||
|
SiteHalfSpinor SitePplus;
|
||||||
|
SiteHalfSpinor SitePminus;
|
||||||
|
|
||||||
|
for(int s1=0; s1<Ls; s1++){
|
||||||
|
SiteChi = zero;
|
||||||
|
for(int s2=0; s2<Ls; s2++){
|
||||||
|
int lex2 = s2 + Ls*site;
|
||||||
|
if(PplusMat(s1,s2) != 0.0){
|
||||||
|
spProj5p(SitePplus,psi[lex2]);
|
||||||
|
accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
|
||||||
|
}
|
||||||
|
if(PminusMat(s1,s2) != 0.0){
|
||||||
|
spProj5m(SitePminus, psi[lex2]);
|
||||||
|
accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
chi[s1+Ls*site] = SiteChi*0.5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
|
||||||
|
|
||||||
|
template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
|
||||||
|
|
||||||
|
template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}}
|
168
lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
Normal file
168
lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
|
||||||
|
// Pminus fowards
|
||||||
|
// Pplus backwards
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
int Ls = this->Ls;
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
if(s==0) {
|
||||||
|
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
|
||||||
|
} else if (s==(Ls-1)) {
|
||||||
|
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
|
||||||
|
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
} else {
|
||||||
|
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
int Ls = this->Ls;
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
if(s==0) {
|
||||||
|
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
|
||||||
|
} else if (s==(Ls-1)) {
|
||||||
|
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
|
||||||
|
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
} else {
|
||||||
|
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
Coeff_t czero(0.0);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
FermionField tmp(psi._grid);
|
||||||
|
|
||||||
|
// Apply (L^{\prime})^{-1}
|
||||||
|
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-1}
|
||||||
|
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
|
||||||
|
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// U_m^{-1} D^{-1}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
|
||||||
|
}
|
||||||
|
axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1);
|
||||||
|
axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);
|
||||||
|
|
||||||
|
// Apply U^{-1}
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[Ls]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
Coeff_t czero(0.0);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
FermionField tmp(psi._grid);
|
||||||
|
|
||||||
|
// Apply (U^{\prime})^{-dagger}
|
||||||
|
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// U_m^{-\dagger}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-\dagger} D^{-dagger}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
|
||||||
|
}
|
||||||
|
axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1);
|
||||||
|
axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);
|
||||||
|
|
||||||
|
// Apply L^{-dagger}
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[Ls]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}}
|
605
lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
Normal file
605
lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
Normal file
@ -0,0 +1,605 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Dense matrix versions of routines
|
||||||
|
*/
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
|
{
|
||||||
|
GridBase* grid = psi._grid;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int LLs = grid->_rdimensions[0];
|
||||||
|
const int nsimd = Simd::Nsimd();
|
||||||
|
|
||||||
|
Vector<iSinglet<Simd> > u(LLs);
|
||||||
|
Vector<iSinglet<Simd> > l(LLs);
|
||||||
|
Vector<iSinglet<Simd> > d(LLs);
|
||||||
|
|
||||||
|
assert(Ls/LLs == nsimd);
|
||||||
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
// just directly address via type pun
|
||||||
|
typedef typename Simd::scalar_type scalar_type;
|
||||||
|
scalar_type* u_p = (scalar_type*) &u[0];
|
||||||
|
scalar_type* l_p = (scalar_type*) &l[0];
|
||||||
|
scalar_type* d_p = (scalar_type*) &d[0];
|
||||||
|
|
||||||
|
for(int o=0;o<LLs;o++){ // outer
|
||||||
|
for(int i=0;i<nsimd;i++){ //inner
|
||||||
|
int s = o + i*LLs;
|
||||||
|
int ss = o*nsimd + i;
|
||||||
|
u_p[ss] = upper[s];
|
||||||
|
l_p[ss] = lower[s];
|
||||||
|
d_p[ss] = diag[s];
|
||||||
|
}}
|
||||||
|
|
||||||
|
this->M5Dcalls++;
|
||||||
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
|
assert(Nc == 3);
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
|
||||||
|
alignas(64) SiteHalfSpinor hp;
|
||||||
|
alignas(64) SiteHalfSpinor hm;
|
||||||
|
alignas(64) SiteSpinor fp;
|
||||||
|
alignas(64) SiteSpinor fm;
|
||||||
|
|
||||||
|
for(int v=0; v<LLs; v++){
|
||||||
|
|
||||||
|
int vp = (v+1)%LLs;
|
||||||
|
int vm = (v+LLs-1)%LLs;
|
||||||
|
|
||||||
|
spProj5m(hp, psi[ss+vp]);
|
||||||
|
spProj5p(hm, psi[ss+vm]);
|
||||||
|
|
||||||
|
if (vp <= v){ rotate(hp, hp, 1); }
|
||||||
|
if (vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||||
|
|
||||||
|
hp = 0.5*hp;
|
||||||
|
hm = 0.5*hm;
|
||||||
|
|
||||||
|
spRecon5m(fp, hp);
|
||||||
|
spRecon5p(fm, hm);
|
||||||
|
|
||||||
|
chi[ss+v] = d[v]*phi[ss+v];
|
||||||
|
chi[ss+v] = chi[ss+v] + u[v]*fp;
|
||||||
|
chi[ss+v] = chi[ss+v] + l[v]*fm;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
for(int v=0; v<LLs; v++){
|
||||||
|
|
||||||
|
vprefetch(psi[ss+v+LLs]);
|
||||||
|
|
||||||
|
int vp = (v==LLs-1) ? 0 : v+1;
|
||||||
|
int vm = (v==0) ? LLs-1 : v-1;
|
||||||
|
|
||||||
|
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||||
|
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||||
|
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||||
|
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||||
|
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||||
|
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||||
|
|
||||||
|
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||||
|
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||||
|
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||||
|
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||||
|
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||||
|
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||||
|
|
||||||
|
if(vp <= v){
|
||||||
|
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||||
|
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||||
|
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||||
|
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||||
|
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||||
|
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(vm >= v){
|
||||||
|
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||||
|
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||||
|
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||||
|
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||||
|
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||||
|
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Can force these to real arithmetic and save 2x.
|
||||||
|
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||||
|
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||||
|
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||||
|
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||||
|
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||||
|
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||||
|
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||||
|
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||||
|
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||||
|
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||||
|
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||||
|
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||||
|
|
||||||
|
vstream(chi[ss+v]()(0)(0), p_00);
|
||||||
|
vstream(chi[ss+v]()(0)(1), p_01);
|
||||||
|
vstream(chi[ss+v]()(0)(2), p_02);
|
||||||
|
vstream(chi[ss+v]()(1)(0), p_10);
|
||||||
|
vstream(chi[ss+v]()(1)(1), p_11);
|
||||||
|
vstream(chi[ss+v]()(1)(2), p_12);
|
||||||
|
vstream(chi[ss+v]()(2)(0), p_20);
|
||||||
|
vstream(chi[ss+v]()(2)(1), p_21);
|
||||||
|
vstream(chi[ss+v]()(2)(2), p_22);
|
||||||
|
vstream(chi[ss+v]()(3)(0), p_30);
|
||||||
|
vstream(chi[ss+v]()(3)(1), p_31);
|
||||||
|
vstream(chi[ss+v]()(3)(2), p_32);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
this->M5Dtime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
|
{
|
||||||
|
GridBase* grid = psi._grid;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int LLs = grid->_rdimensions[0];
|
||||||
|
int nsimd = Simd::Nsimd();
|
||||||
|
|
||||||
|
Vector<iSinglet<Simd> > u(LLs);
|
||||||
|
Vector<iSinglet<Simd> > l(LLs);
|
||||||
|
Vector<iSinglet<Simd> > d(LLs);
|
||||||
|
|
||||||
|
assert(Ls/LLs == nsimd);
|
||||||
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
// just directly address via type pun
|
||||||
|
typedef typename Simd::scalar_type scalar_type;
|
||||||
|
scalar_type* u_p = (scalar_type*) &u[0];
|
||||||
|
scalar_type* l_p = (scalar_type*) &l[0];
|
||||||
|
scalar_type* d_p = (scalar_type*) &d[0];
|
||||||
|
|
||||||
|
for(int o=0; o<LLs; o++){ // outer
|
||||||
|
for(int i=0; i<nsimd; i++){ //inner
|
||||||
|
int s = o + i*LLs;
|
||||||
|
int ss = o*nsimd + i;
|
||||||
|
u_p[ss] = upper[s];
|
||||||
|
l_p[ss] = lower[s];
|
||||||
|
d_p[ss] = diag[s];
|
||||||
|
}}
|
||||||
|
|
||||||
|
this->M5Dcalls++;
|
||||||
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
|
||||||
|
alignas(64) SiteHalfSpinor hp;
|
||||||
|
alignas(64) SiteHalfSpinor hm;
|
||||||
|
alignas(64) SiteSpinor fp;
|
||||||
|
alignas(64) SiteSpinor fm;
|
||||||
|
|
||||||
|
for(int v=0; v<LLs; v++){
|
||||||
|
|
||||||
|
int vp = (v+1)%LLs;
|
||||||
|
int vm = (v+LLs-1)%LLs;
|
||||||
|
|
||||||
|
spProj5p(hp, psi[ss+vp]);
|
||||||
|
spProj5m(hm, psi[ss+vm]);
|
||||||
|
|
||||||
|
if(vp <= v){ rotate(hp, hp, 1); }
|
||||||
|
if(vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||||
|
|
||||||
|
hp = hp*0.5;
|
||||||
|
hm = hm*0.5;
|
||||||
|
spRecon5p(fp, hp);
|
||||||
|
spRecon5m(fm, hm);
|
||||||
|
|
||||||
|
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
|
||||||
|
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
for(int v=0; v<LLs; v++){
|
||||||
|
|
||||||
|
vprefetch(psi[ss+v+LLs]);
|
||||||
|
|
||||||
|
int vp = (v == LLs-1) ? 0 : v+1;
|
||||||
|
int vm = (v == 0 ) ? LLs-1 : v-1;
|
||||||
|
|
||||||
|
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||||
|
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||||
|
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||||
|
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||||
|
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||||
|
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||||
|
|
||||||
|
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||||
|
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||||
|
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||||
|
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||||
|
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||||
|
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||||
|
|
||||||
|
if (vp <= v){
|
||||||
|
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||||
|
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||||
|
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||||
|
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||||
|
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||||
|
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(vm >= v){
|
||||||
|
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||||
|
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||||
|
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||||
|
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||||
|
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||||
|
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||||
|
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||||
|
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||||
|
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||||
|
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||||
|
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||||
|
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||||
|
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||||
|
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||||
|
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||||
|
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||||
|
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||||
|
|
||||||
|
vstream(chi[ss+v]()(0)(0), p_00);
|
||||||
|
vstream(chi[ss+v]()(0)(1), p_01);
|
||||||
|
vstream(chi[ss+v]()(0)(2), p_02);
|
||||||
|
vstream(chi[ss+v]()(1)(0), p_10);
|
||||||
|
vstream(chi[ss+v]()(1)(1), p_11);
|
||||||
|
vstream(chi[ss+v]()(1)(2), p_12);
|
||||||
|
vstream(chi[ss+v]()(2)(0), p_20);
|
||||||
|
vstream(chi[ss+v]()(2)(1), p_21);
|
||||||
|
vstream(chi[ss+v]()(2)(2), p_22);
|
||||||
|
vstream(chi[ss+v]()(3)(0), p_30);
|
||||||
|
vstream(chi[ss+v]()(3)(1), p_31);
|
||||||
|
vstream(chi[ss+v]()(3)(2), p_32);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
this->M5Dtime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef AVX512
|
||||||
|
#include<simd/Intel512common.h>
|
||||||
|
#include<simd/Intel512avx.h>
|
||||||
|
#include<simd/Intel512single.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
|
||||||
|
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||||
|
{
|
||||||
|
#ifndef AVX512
|
||||||
|
{
|
||||||
|
SiteHalfSpinor BcastP;
|
||||||
|
SiteHalfSpinor BcastM;
|
||||||
|
SiteHalfSpinor SiteChiP;
|
||||||
|
SiteHalfSpinor SiteChiM;
|
||||||
|
|
||||||
|
// Ls*Ls * 2 * 12 * vol flops
|
||||||
|
for(int s1=0; s1<LLs; s1++){
|
||||||
|
|
||||||
|
for(int s2=0; s2<LLs; s2++){
|
||||||
|
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
|
||||||
|
|
||||||
|
int s = s2 + l*LLs;
|
||||||
|
int lex = s2 + LLs*site;
|
||||||
|
|
||||||
|
if( s2==0 && l==0 ){
|
||||||
|
SiteChiP=zero;
|
||||||
|
SiteChiM=zero;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int sp=0; sp<2; sp++){
|
||||||
|
for(int co=0; co<Nc; co++){
|
||||||
|
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
|
||||||
|
}}
|
||||||
|
|
||||||
|
for(int sp=0; sp<2; sp++){
|
||||||
|
for(int co=0; co<Nc; co++){
|
||||||
|
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
|
||||||
|
}}
|
||||||
|
|
||||||
|
for(int sp=0; sp<2; sp++){
|
||||||
|
for(int co=0; co<Nc; co++){
|
||||||
|
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
|
||||||
|
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
|
||||||
|
{
|
||||||
|
int lex = s1 + LLs*site;
|
||||||
|
for(int sp=0; sp<2; sp++){
|
||||||
|
for(int co=0; co<Nc; co++){
|
||||||
|
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||||
|
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
// pointers
|
||||||
|
// MASK_REGS;
|
||||||
|
#define Chi_00 %%zmm1
|
||||||
|
#define Chi_01 %%zmm2
|
||||||
|
#define Chi_02 %%zmm3
|
||||||
|
#define Chi_10 %%zmm4
|
||||||
|
#define Chi_11 %%zmm5
|
||||||
|
#define Chi_12 %%zmm6
|
||||||
|
#define Chi_20 %%zmm7
|
||||||
|
#define Chi_21 %%zmm8
|
||||||
|
#define Chi_22 %%zmm9
|
||||||
|
#define Chi_30 %%zmm10
|
||||||
|
#define Chi_31 %%zmm11
|
||||||
|
#define Chi_32 %%zmm12
|
||||||
|
|
||||||
|
#define BCAST0 %%zmm13
|
||||||
|
#define BCAST1 %%zmm14
|
||||||
|
#define BCAST2 %%zmm15
|
||||||
|
#define BCAST3 %%zmm16
|
||||||
|
#define BCAST4 %%zmm17
|
||||||
|
#define BCAST5 %%zmm18
|
||||||
|
#define BCAST6 %%zmm19
|
||||||
|
#define BCAST7 %%zmm20
|
||||||
|
#define BCAST8 %%zmm21
|
||||||
|
#define BCAST9 %%zmm22
|
||||||
|
#define BCAST10 %%zmm23
|
||||||
|
#define BCAST11 %%zmm24
|
||||||
|
|
||||||
|
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
|
||||||
|
for(int s1=0; s1<LLs; s1++){
|
||||||
|
|
||||||
|
for(int s2=0; s2<LLs; s2++){
|
||||||
|
|
||||||
|
int lex = s2 + LLs*site;
|
||||||
|
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
|
||||||
|
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
|
||||||
|
uint64_t a2 = (uint64_t) &psi[lex];
|
||||||
|
|
||||||
|
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
|
||||||
|
if((s2+l)==0) {
|
||||||
|
asm(
|
||||||
|
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||||
|
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||||
|
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||||
|
VBCASTCDUP(0,%2,BCAST0)
|
||||||
|
VBCASTCDUP(1,%2,BCAST1)
|
||||||
|
VBCASTCDUP(2,%2,BCAST2)
|
||||||
|
VBCASTCDUP(3,%2,BCAST3)
|
||||||
|
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
|
||||||
|
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
|
||||||
|
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
|
||||||
|
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
|
||||||
|
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
|
||||||
|
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
|
||||||
|
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
|
||||||
|
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
|
||||||
|
VMULMEM(0,%1,BCAST8,Chi_22)
|
||||||
|
VMULMEM(0,%1,BCAST9,Chi_30)
|
||||||
|
VMULMEM(0,%1,BCAST10,Chi_31)
|
||||||
|
VMULMEM(0,%1,BCAST11,Chi_32)
|
||||||
|
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||||
|
} else {
|
||||||
|
asm(
|
||||||
|
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
|
||||||
|
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
|
||||||
|
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
|
||||||
|
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
|
||||||
|
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
|
||||||
|
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
|
||||||
|
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
|
||||||
|
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
|
||||||
|
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
|
||||||
|
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
|
||||||
|
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
|
||||||
|
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
|
||||||
|
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||||
|
}
|
||||||
|
a0 = a0 + incr;
|
||||||
|
a1 = a1 + incr;
|
||||||
|
a2 = a2 + sizeof(Simd::scalar_type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
int lexa = s1+LLs*site;
|
||||||
|
asm (
|
||||||
|
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||||
|
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||||
|
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||||
|
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||||
|
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef Chi_00
|
||||||
|
#undef Chi_01
|
||||||
|
#undef Chi_02
|
||||||
|
#undef Chi_10
|
||||||
|
#undef Chi_11
|
||||||
|
#undef Chi_12
|
||||||
|
#undef Chi_20
|
||||||
|
#undef Chi_21
|
||||||
|
#undef Chi_22
|
||||||
|
#undef Chi_30
|
||||||
|
#undef Chi_31
|
||||||
|
#undef Chi_32
|
||||||
|
|
||||||
|
#undef BCAST0
|
||||||
|
#undef BCAST1
|
||||||
|
#undef BCAST2
|
||||||
|
#undef BCAST3
|
||||||
|
#undef BCAST4
|
||||||
|
#undef BCAST5
|
||||||
|
#undef BCAST6
|
||||||
|
#undef BCAST7
|
||||||
|
#undef BCAST8
|
||||||
|
#undef BCAST9
|
||||||
|
#undef BCAST10
|
||||||
|
#undef BCAST11
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
// Z-mobius version
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
|
||||||
|
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||||
|
{
|
||||||
|
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
|
||||||
|
exit(-1);
|
||||||
|
};
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int LLs = psi._grid->_rdimensions[0];
|
||||||
|
int vol = psi._grid->oSites()/LLs;
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
Vector<iSinglet<Simd> > Matp;
|
||||||
|
Vector<iSinglet<Simd> > Matm;
|
||||||
|
Vector<iSinglet<Simd> > *_Matp;
|
||||||
|
Vector<iSinglet<Simd> > *_Matm;
|
||||||
|
|
||||||
|
// MooeeInternalCompute(dag,inv,Matp,Matm);
|
||||||
|
if(inv && dag){
|
||||||
|
_Matp = &this->MatpInvDag;
|
||||||
|
_Matm = &this->MatmInvDag;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(inv && (!dag)){
|
||||||
|
_Matp = &this->MatpInv;
|
||||||
|
_Matm = &this->MatmInv;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!inv){
|
||||||
|
MooeeInternalCompute(dag, inv, Matp, Matm);
|
||||||
|
_Matp = &Matp;
|
||||||
|
_Matm = &Matm;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(_Matp->size() == Ls*LLs);
|
||||||
|
|
||||||
|
this->MooeeInvCalls++;
|
||||||
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
|
if(switcheroo<Coeff_t>::iscomplex()){
|
||||||
|
parallel_for(auto site=0; site<vol; site++){
|
||||||
|
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
parallel_for(auto site=0; site<vol; site++){
|
||||||
|
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->MooeeInvTime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
|
||||||
|
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
|
||||||
|
|
||||||
|
template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
|
||||||
|
template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}}
|
@ -38,6 +38,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
|||||||
// - ContinuedFractionFermion5D.cc
|
// - ContinuedFractionFermion5D.cc
|
||||||
// - WilsonFermion.cc
|
// - WilsonFermion.cc
|
||||||
// - WilsonKernels.cc
|
// - WilsonKernels.cc
|
||||||
|
// - DomainWallEOFAFermion.cc
|
||||||
|
// - MobiusEOFAFermion.cc
|
||||||
//
|
//
|
||||||
// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
|
// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
|
||||||
// for EVERY .cc file. This define centralises the list and restores global push of impl cases
|
// for EVERY .cc file. This define centralises the list and restores global push of impl cases
|
||||||
@ -55,8 +57,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
||||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
|
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
|
||||||
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
|
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
|
||||||
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
|
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||||
#include <Grid/qcd/action/fermion/MobiusFermion.h>
|
#include <Grid/qcd/action/fermion/MobiusFermion.h>
|
||||||
|
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||||
#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
|
#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
|
||||||
#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
|
#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
|
||||||
#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
|
#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
|
||||||
@ -113,6 +116,14 @@ typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
|
|||||||
typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
|
typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
|
||||||
typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
|
typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
|
||||||
|
|
||||||
|
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
|
||||||
|
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
|
||||||
|
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
|
||||||
|
|
||||||
|
typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
|
||||||
|
typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
|
||||||
|
typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
|
||||||
|
|
||||||
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
|
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
|
||||||
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
|
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
|
||||||
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
|
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
|
||||||
@ -121,6 +132,14 @@ typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
|
|||||||
typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
|
typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
|
||||||
typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
|
typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
|
||||||
|
|
||||||
|
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
|
||||||
|
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
|
||||||
|
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
|
||||||
|
|
||||||
|
typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
|
||||||
|
typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
|
||||||
|
typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
|
||||||
|
|
||||||
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
|
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
|
||||||
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
|
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
|
||||||
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
|
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
|
||||||
@ -138,6 +157,14 @@ typedef DomainWallFermion<DomainWallVec5dImplRL> DomainWallFermionVec5dRL;
|
|||||||
typedef DomainWallFermion<DomainWallVec5dImplFH> DomainWallFermionVec5dFH;
|
typedef DomainWallFermion<DomainWallVec5dImplFH> DomainWallFermionVec5dFH;
|
||||||
typedef DomainWallFermion<DomainWallVec5dImplDF> DomainWallFermionVec5dDF;
|
typedef DomainWallFermion<DomainWallVec5dImplDF> DomainWallFermionVec5dDF;
|
||||||
|
|
||||||
|
typedef DomainWallEOFAFermion<DomainWallVec5dImplR> DomainWallEOFAFermionVec5dR;
|
||||||
|
typedef DomainWallEOFAFermion<DomainWallVec5dImplF> DomainWallEOFAFermionVec5dF;
|
||||||
|
typedef DomainWallEOFAFermion<DomainWallVec5dImplD> DomainWallEOFAFermionVec5dD;
|
||||||
|
|
||||||
|
typedef DomainWallEOFAFermion<DomainWallVec5dImplRL> DomainWallEOFAFermionVec5dRL;
|
||||||
|
typedef DomainWallEOFAFermion<DomainWallVec5dImplFH> DomainWallEOFAFermionVec5dFH;
|
||||||
|
typedef DomainWallEOFAFermion<DomainWallVec5dImplDF> DomainWallEOFAFermionVec5dDF;
|
||||||
|
|
||||||
typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
|
typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
|
||||||
typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
|
typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
|
||||||
typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
|
typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
|
||||||
@ -146,6 +173,14 @@ typedef MobiusFermion<DomainWallVec5dImplRL> MobiusFermionVec5dRL;
|
|||||||
typedef MobiusFermion<DomainWallVec5dImplFH> MobiusFermionVec5dFH;
|
typedef MobiusFermion<DomainWallVec5dImplFH> MobiusFermionVec5dFH;
|
||||||
typedef MobiusFermion<DomainWallVec5dImplDF> MobiusFermionVec5dDF;
|
typedef MobiusFermion<DomainWallVec5dImplDF> MobiusFermionVec5dDF;
|
||||||
|
|
||||||
|
typedef MobiusEOFAFermion<DomainWallVec5dImplR> MobiusEOFAFermionVec5dR;
|
||||||
|
typedef MobiusEOFAFermion<DomainWallVec5dImplF> MobiusEOFAFermionVec5dF;
|
||||||
|
typedef MobiusEOFAFermion<DomainWallVec5dImplD> MobiusEOFAFermionVec5dD;
|
||||||
|
|
||||||
|
typedef MobiusEOFAFermion<DomainWallVec5dImplRL> MobiusEOFAFermionVec5dRL;
|
||||||
|
typedef MobiusEOFAFermion<DomainWallVec5dImplFH> MobiusEOFAFermionVec5dFH;
|
||||||
|
typedef MobiusEOFAFermion<DomainWallVec5dImplDF> MobiusEOFAFermionVec5dDF;
|
||||||
|
|
||||||
typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
|
typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
|
||||||
typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
|
typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
|
||||||
typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
|
typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
|
||||||
@ -206,6 +241,14 @@ typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
|
|||||||
typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
|
typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
|
||||||
typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
|
typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
|
||||||
|
|
||||||
|
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
|
||||||
|
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
|
||||||
|
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
|
||||||
|
|
||||||
|
typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
|
||||||
|
typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
|
||||||
|
typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
|
||||||
|
|
||||||
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
|
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
|
||||||
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
|
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
|
||||||
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
|
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
|
||||||
@ -222,6 +265,14 @@ typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
|
|||||||
typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
|
typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
|
||||||
typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
|
typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
|
||||||
|
|
||||||
|
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
|
||||||
|
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
|
||||||
|
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
|
||||||
|
|
||||||
|
typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
|
||||||
|
typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
|
||||||
|
typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
|
||||||
|
|
||||||
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
|
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
|
||||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
|
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
|
||||||
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
|
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
|
||||||
@ -237,4 +288,11 @@ typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermion
|
|||||||
|
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
////////////////////
|
||||||
|
// Scalar QED actions
|
||||||
|
// TODO: this needs to move to another header after rename to Fermion.h
|
||||||
|
////////////////////
|
||||||
|
#include <Grid/qcd/action/scalar/Scalar.h>
|
||||||
|
#include <Grid/qcd/action/gauge/Photon.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -538,6 +538,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class ref>
|
||||||
|
inline void loadLinkElement(Simd ®, ref &memory) {
|
||||||
|
reg = memory;
|
||||||
|
}
|
||||||
|
|
||||||
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
|
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||||
{
|
{
|
||||||
conformable(Uds._grid,GaugeGrid);
|
conformable(Uds._grid,GaugeGrid);
|
||||||
|
@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
{
|
{
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
int LLs = in._grid->_rdimensions[0];
|
int LLs = in._grid->_rdimensions[0];
|
||||||
st.HaloExchange(in,compressor);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
DhopCommTime -= usecond();
|
||||||
|
st.HaloExchange(in,compressor);
|
||||||
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
DhopTotalTime += usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls+=1;
|
||||||
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
||||||
conformable(in._grid,out._grid); // drops the cb check
|
conformable(in._grid,out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls+=1;
|
||||||
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
||||||
conformable(in._grid,out._grid); // drops the cb check
|
conformable(in._grid,out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls+=2;
|
||||||
conformable(in._grid,FermionGrid()); // verifies full grid
|
conformable(in._grid,FermionGrid()); // verifies full grid
|
||||||
conformable(in._grid,out._grid);
|
conformable(in._grid,out._grid);
|
||||||
|
|
||||||
@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
|
|||||||
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
|
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::Report(void)
|
||||||
|
{
|
||||||
|
std::vector<int> latt = GridDefaultLatt();
|
||||||
|
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||||
|
RealD NP = _FourDimGrid->_Nprocessors;
|
||||||
|
RealD NN = _FourDimGrid->NodeCount();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : "
|
||||||
|
<< DhopCalls << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : "
|
||||||
|
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : "
|
||||||
|
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : "
|
||||||
|
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||||
|
|
||||||
|
// Average the compute time
|
||||||
|
_FourDimGrid->GlobalSum(DhopComputeTime);
|
||||||
|
DhopComputeTime/=NP;
|
||||||
|
|
||||||
|
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||||
|
|
||||||
|
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" <<std::endl; Stencil.Report();
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
|
||||||
|
}
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
|
||||||
|
{
|
||||||
|
DhopCalls = 0;
|
||||||
|
DhopTotalTime = 0;
|
||||||
|
DhopCommTime = 0;
|
||||||
|
DhopComputeTime = 0;
|
||||||
|
Stencil.ZeroCounters();
|
||||||
|
StencilEven.ZeroCounters();
|
||||||
|
StencilOdd.ZeroCounters();
|
||||||
|
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////
|
||||||
// Implement the general interface. Here we use SAME mass on all slices
|
// Implement the general interface. Here we use SAME mass on all slices
|
||||||
|
@ -55,6 +55,16 @@ namespace QCD {
|
|||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
|
////////////////////////////////////////
|
||||||
|
// Performance monitoring
|
||||||
|
////////////////////////////////////////
|
||||||
|
void Report(void);
|
||||||
|
void ZeroCounters(void);
|
||||||
|
double DhopTotalTime;
|
||||||
|
double DhopCalls;
|
||||||
|
double DhopCommTime;
|
||||||
|
double DhopComputeTime;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
502
lib/qcd/action/fermion/MobiusEOFAFermion.cc
Normal file
502
lib/qcd/action/fermion/MobiusEOFAFermion.cc
Normal file
@ -0,0 +1,502 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/Grid_Eigen_Dense.h>
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
|
||||||
|
GaugeField &_Umu,
|
||||||
|
GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
RealD _mq1, RealD _mq2, RealD _mq3,
|
||||||
|
RealD _shift, int _pm, RealD _M5,
|
||||||
|
RealD _b, RealD _c, const ImplParams &p) :
|
||||||
|
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
|
||||||
|
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
|
||||||
|
_shift, _pm, _M5, _b, _c, p)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
RealD eps = 1.0;
|
||||||
|
Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
|
||||||
|
assert(zdata->n == this->Ls);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
|
||||||
|
",c=" << _c << ") with Ls=" << Ls << std::endl;
|
||||||
|
this->SetCoefficientsTanh(zdata, _b, _c);
|
||||||
|
std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
|
||||||
|
",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
|
||||||
|
",pm=" << _pm << ")" << std::endl;
|
||||||
|
|
||||||
|
Approx::zolotarev_free(zdata);
|
||||||
|
|
||||||
|
if(_shift != 0.0){
|
||||||
|
SetCoefficientsPrecondShiftOps();
|
||||||
|
} else {
|
||||||
|
Mooee_shift.resize(Ls, 0.0);
|
||||||
|
MooeeInv_shift_lc.resize(Ls, 0.0);
|
||||||
|
MooeeInv_shift_norm.resize(Ls, 0.0);
|
||||||
|
MooeeInvDag_shift_lc.resize(Ls, 0.0);
|
||||||
|
MooeeInvDag_shift_norm.resize(Ls, 0.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/***************************************************************
|
||||||
|
/* Additional EOFA operators only called outside the inverter.
|
||||||
|
/* Since speed is not essential, simple axpby-style
|
||||||
|
/* implementations should be fine.
|
||||||
|
/***************************************************************/
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
RealD alpha = this->alpha;
|
||||||
|
|
||||||
|
Din = zero;
|
||||||
|
if((sign == 1) && (dag == 0)) { // \Omega_{+}
|
||||||
|
for(int s=0; s<Ls; ++s){
|
||||||
|
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
|
||||||
|
}
|
||||||
|
} else if((sign == -1) && (dag == 0)) { // \Omega_{-}
|
||||||
|
for(int s=0; s<Ls; ++s){
|
||||||
|
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
|
||||||
|
}
|
||||||
|
} else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
|
||||||
|
for(int sp=0; sp<Ls; ++sp){
|
||||||
|
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
|
||||||
|
}
|
||||||
|
} else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
|
||||||
|
for(int sp=0; sp<Ls; ++sp){
|
||||||
|
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
|
||||||
|
// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
RealD b = 0.5 * ( 1.0 + this->alpha );
|
||||||
|
RealD c = 0.5 * ( 1.0 - this->alpha );
|
||||||
|
RealD mq1 = this->mq1;
|
||||||
|
|
||||||
|
for(int s=0; s<Ls; ++s){
|
||||||
|
if(s == 0) {
|
||||||
|
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
|
||||||
|
axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
|
||||||
|
} else if(s == (Ls-1)) {
|
||||||
|
axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
|
||||||
|
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
|
||||||
|
} else {
|
||||||
|
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
|
||||||
|
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
RealD m = this->mq1;
|
||||||
|
RealD c = 0.5 * this->alpha;
|
||||||
|
RealD d = 0.5;
|
||||||
|
|
||||||
|
RealD DtInv_p(0.0), DtInv_m(0.0);
|
||||||
|
RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
|
||||||
|
FermionField tmp(this->FermionGrid());
|
||||||
|
|
||||||
|
for(int s=0; s<Ls; ++s){
|
||||||
|
for(int sp=0; sp<Ls; ++sp){
|
||||||
|
|
||||||
|
DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
|
||||||
|
DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
|
||||||
|
DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
|
||||||
|
DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
|
||||||
|
|
||||||
|
if(sp == 0){
|
||||||
|
axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
|
||||||
|
axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
|
||||||
|
} else {
|
||||||
|
axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
|
||||||
|
axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
|
||||||
|
}
|
||||||
|
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************************************************************/
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
FermionField Din(psi._grid);
|
||||||
|
|
||||||
|
this->Meooe5D(psi, Din);
|
||||||
|
this->DW(Din, chi, DaggerNo);
|
||||||
|
axpby(chi, 1.0, 1.0, chi, psi);
|
||||||
|
this->M5D(psi, chi);
|
||||||
|
return(norm2(chi));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
FermionField Din(psi._grid);
|
||||||
|
|
||||||
|
this->DW(psi, Din, DaggerYes);
|
||||||
|
this->MeooeDag5D(Din, chi);
|
||||||
|
this->M5Ddag(psi, chi);
|
||||||
|
axpby(chi, 1.0, 1.0, chi, psi);
|
||||||
|
return(norm2(chi));
|
||||||
|
}
|
||||||
|
|
||||||
|
/********************************************************************
|
||||||
|
/* Performance critical fermion operators called inside the inverter
|
||||||
|
/********************************************************************/
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
std::vector<Coeff_t> diag(Ls,1.0);
|
||||||
|
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||||
|
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||||
|
|
||||||
|
// no shift term
|
||||||
|
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
|
||||||
|
|
||||||
|
// fused M + shift operation
|
||||||
|
else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
std::vector<Coeff_t> diag(Ls,1.0);
|
||||||
|
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||||
|
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||||
|
|
||||||
|
// no shift term
|
||||||
|
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
|
||||||
|
|
||||||
|
// fused M + shift operation
|
||||||
|
else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
|
||||||
|
}
|
||||||
|
|
||||||
|
// half checkerboard operations
|
||||||
|
// Even-even (or odd-odd) block of the preconditioned operator.
// Builds the s-tridiagonal coefficients from bee/cee and dispatches to the
// appropriate M5D kernel, fusing in the EOFA shift term when shift != 0.
template<class Impl>
void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
{
  int Ls = this->Ls;

  // coefficients of Mooee: diagonal bee[s], off-diagonals -cee[s]
  std::vector<Coeff_t> diag = this->bee;
  std::vector<Coeff_t> upper(Ls);
  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    upper[s] = -this->cee[s];
    lower[s] = -this->cee[s];
  }
  // Boundary wrap-around in s: note the double sign flip, so after the
  // *= -mq1 these become +mq1*cee[Ls-1] and +mq1*cee[0] respectively.
  upper[Ls-1] *= -this->mq1;
  lower[0] *= -this->mq1;

  // no shift term
  if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }

  // fused M + shift operation
  else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
}
|
||||||
|
|
||||||
|
// Adjoint of Mooee. The coefficient arrays are laid out as the transpose of
// the Mooee case: each row s picks up the *neighbouring* row's cee, i.e.
// upper[s] = -cee[s+1] and lower[s] = -cee[s-1], with the mq1 boundary
// entries moved to the opposite corners.
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
{
  int Ls = this->Ls;

  // coefficients of MooeeDag (transposed relative to Mooee)
  std::vector<Coeff_t> diag = this->bee;
  std::vector<Coeff_t> upper(Ls);
  std::vector<Coeff_t> lower(Ls);
  for(int s=0; s<Ls; s++){
    if(s==0) {
      upper[s] = -this->cee[s+1];
      lower[s] = this->mq1*this->cee[Ls-1];   // wrap-around boundary term
    } else if(s==(Ls-1)) {
      upper[s] = this->mq1*this->cee[0];      // wrap-around boundary term
      lower[s] = -this->cee[s-1];
    } else {
      upper[s] = -this->cee[s+1];
      lower[s] = -this->cee[s-1];
    }
  }

  // no shift term
  if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }

  // fused M + shift operation
  else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
}
|
||||||
|
|
||||||
|
/****************************************************************************************/
|
||||||
|
|
||||||
|
// Computes coefficients for applying Cayley preconditioned shift operators
|
||||||
|
// (Mooee + \Delta) --> Mooee_shift
|
||||||
|
// (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
|
||||||
|
// (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
|
||||||
|
// For the latter two cases, the operation takes the form
|
||||||
|
// [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
|
||||||
|
// ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
|
||||||
|
// Computes coefficients for applying Cayley preconditioned shift operators
//  (Mooee + \Delta) --> Mooee_shift
//  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
//  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
// For the latter two cases, the operation takes the form
//  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
//      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
template<class Impl>
void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
{
  int Ls = this->Ls;
  int pm = this->pm;              // chirality sign of the shift projector
  RealD alpha = this->alpha;
  RealD k = this->k;
  RealD mq1 = this->mq1;
  RealD shift = this->shift;

  // Initialize
  Mooee_shift.resize(Ls);
  MooeeInv_shift_lc.resize(Ls);
  MooeeInv_shift_norm.resize(Ls);
  MooeeInvDag_shift_lc.resize(Ls);
  MooeeInvDag_shift_norm.resize(Ls);

  // Construct Mooee_shift: rank-one column filled in pm-dependent order
  // (idx runs forward for pm == +1, backward for pm == -1).
  int idx(0);
  Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
    ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
  for(int s=0; s<Ls; ++s){
    idx = (pm == 1) ? (s) : (Ls-1-s);
    Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
  }

  // Tridiagonal solve for MooeeInvDag_shift_lc
  {
    Coeff_t m(0.0);
    std::vector<Coeff_t> d = Mooee_shift;   // right-hand side
    std::vector<Coeff_t> u(Ls,0.0);         // rank-one correction vector
    std::vector<Coeff_t> y(Ls,0.0);         // solution of Mooee'*y = d
    std::vector<Coeff_t> q(Ls,0.0);         // solution of Mooee'*q = u
    if(pm == 1){ u[0] = 1.0; }
    else{ u[Ls-1] = 1.0; }

    // Tridiagonal matrix algorithm + Sherman-Morrison formula
    //
    // We solve
    //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
    // where Mooee' is the tridiagonal part of Mooee_{+}, and
    // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
    // so that the outer-product u \otimes v gives the (0,Ls-1)
    // entry of Mooee_{+}.
    //
    // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
    // and then construct the solution to the original system
    //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
    //
    // Forward elimination is only needed for pm == +1; for pm == -1 the
    // matrix is upper-triangular in this ordering and the back-substitution
    // below handles it directly.
    if(pm == 1){
      for(int s=1; s<Ls; ++s){
        m = -this->cee[s] / this->bee[s-1];
        d[s] -= m*d[s-1];
        u[s] -= m*u[s-1];
      }
    }
    y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
    q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
    for(int s=Ls-2; s>=0; --s){
      if(pm == 1){
        // after elimination the system is diagonal
        y[s] = d[s] / this->bee[s];
        q[s] = u[s] / this->bee[s];
      } else {
        // back-substitution against the cee super-diagonal
        y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
        q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
      }
    }

    // Construct MooeeInvDag_shift_lc via the Sherman-Morrison correction
    for(int s=0; s<Ls; ++s){
      if(pm == 1){
        MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
          (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
      } else {
        MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
          (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
      }
    }

    // Compute remaining coefficients (N is reused here as a normalization)
    N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
    for(int s=0; s<Ls; ++s){

      // MooeeInv_shift_lc
      if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
      else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }

      // MooeeInv_shift_norm
      MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
        ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;

      // MooeeInvDag_shift_norm
      if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
        ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
      else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
        ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
    }
  }
}
|
||||||
|
|
||||||
|
// Recompute coefficients for a different value of shift constant
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
|
||||||
|
{
|
||||||
|
this->shift = new_shift;
|
||||||
|
if(new_shift != 0.0){
|
||||||
|
SetCoefficientsPrecondShiftOps();
|
||||||
|
} else {
|
||||||
|
int Ls = this->Ls;
|
||||||
|
Mooee_shift.resize(Ls,0.0);
|
||||||
|
MooeeInv_shift_lc.resize(Ls,0.0);
|
||||||
|
MooeeInv_shift_norm.resize(Ls,0.0);
|
||||||
|
MooeeInvDag_shift_lc.resize(Ls,0.0);
|
||||||
|
MooeeInvDag_shift_norm.resize(Ls,0.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Precompute the dense Ls x Ls chiral-projected matrices P_{+}/P_{-}
// (optionally shifted, daggered and/or inverted) and repack them into
// SIMD-vector layout (Matp/Matm) for the vectorised 5th-dimension kernels.
// Only runs when the 5th dimension is SIMD-vectorised (LLs != Ls).
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
  Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
  int Ls = this->Ls;

  GridBase* grid = this->FermionRedBlackGrid();
  int LLs = grid->_rdimensions[0];   // local (reduced) extent of the s-direction

  if(LLs == Ls){ return; } // Not vectorised in 5th direction

  // Dense representations of the two chiral blocks of Mooee
  Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);

  // Diagonal: bee[s] for both chiralities
  for(int s=0; s<Ls; s++){
    Pplus(s,s) = this->bee[s];
    Pminus(s,s) = this->bee[s];
  }

  // Off-diagonals: P_- hops s -> s+1, P_+ hops s -> s-1
  for(int s=0; s<Ls-1; s++){
    Pminus(s,s+1) = -this->cee[s];
    Pplus(s+1,s) = -this->cee[s+1];
  }

  // Quark-mass boundary terms closing the 5th dimension
  Pplus (0,Ls-1) = this->mq1*this->cee[0];
  Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];

  // EOFA shift: rank-one column added to P_+ (pm == +1) or P_- (pm == -1)
  if(this->shift != 0.0){
    RealD c = 0.5 * this->alpha;
    RealD d = 0.5;
    RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
    if(this->pm == 1) {
      for(int s=0; s<Ls; ++s){
        Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
      }
    } else {
      for(int s=0; s<Ls; ++s){
        Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
      }
    }
  }

  Eigen::MatrixXcd PplusMat ;
  Eigen::MatrixXcd PminusMat;

  // Optionally invert (for Mooee^{-1} kernels) ...
  if(inv) {
    PplusMat = Pplus.inverse();
    PminusMat = Pminus.inverse();
  } else {
    PplusMat = Pplus;
    PminusMat = Pminus;
  }

  // ... and/or take the adjoint (for daggered kernels)
  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }

  // Repack the Eigen matrices into SIMD lanes: lane l of entry (s1,s2)
  // holds matrix element (l*LLs + s1, s2).
  typedef typename SiteHalfSpinor::scalar_type scalar_type;
  const int Nsimd = Simd::Nsimd();
  Matp.resize(Ls*LLs);
  Matm.resize(Ls*LLs);

  for(int s2=0; s2<Ls; s2++){
  for(int s1=0; s1<LLs; s1++){
    int istride = LLs;
    int ostride = 1;
    Simd Vp;
    Simd Vm;
    scalar_type *sp = (scalar_type*) &Vp;
    scalar_type *sm = (scalar_type*) &Vm;
    for(int l=0; l<Nsimd; l++){
      if(switcheroo<Coeff_t>::iscomplex()) {
        sp[l] = PplusMat (l*istride+s1*ostride,s2);
        sm[l] = PminusMat(l*istride+s1*ostride,s2);
      } else {
        // if real: duplicate the real part into both halves of the
        // complex scalar slot
        scalar_type tmp;
        tmp = PplusMat (l*istride+s1*ostride,s2);
        sp[l] = scalar_type(tmp.real(),tmp.real());
        tmp = PminusMat(l*istride+s1*ostride,s2);
        sm[l] = scalar_type(tmp.real(),tmp.real());
      }
    }
    Matp[LLs*s2+s1] = Vp;
    Matm[LLs*s2+s1] = Vm;
  }}
}
|
||||||
|
|
||||||
|
FermOpTemplateInstantiate(MobiusEOFAFermion);
|
||||||
|
GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
|
||||||
|
|
||||||
|
}}
|
133
lib/qcd/action/fermion/MobiusEOFAFermion.h
Normal file
133
lib/qcd/action/fermion/MobiusEOFAFermion.h
Normal file
@ -0,0 +1,133 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.h
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef GRID_QCD_MOBIUS_EOFA_FERMION_H
|
||||||
|
#define GRID_QCD_MOBIUS_EOFA_FERMION_H
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
// Mobius-kernel implementation of the Exact One Flavour Algorithm (EOFA)
// fermion action. Adds the red-black preconditioned shift-operator machinery
// (Mooee + Delta and its inverses) on top of AbstractEOFAFermion.
template<class Impl>
class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
{
  public:
    INHERIT_IMPL_TYPES(Impl);

  public:
    // Shift operator coefficients for red-black preconditioned Mobius EOFA.
    // The *_lc vectors are linear-combination weights, the *_norm vectors
    // the per-s normalizations used in the rank-one inverse corrections;
    // all are (re)computed by SetCoefficientsPrecondShiftOps().
    std::vector<Coeff_t> Mooee_shift;
    std::vector<Coeff_t> MooeeInv_shift_lc;
    std::vector<Coeff_t> MooeeInv_shift_norm;
    std::vector<Coeff_t> MooeeInvDag_shift_lc;
    std::vector<Coeff_t> MooeeInvDag_shift_norm;

    virtual void Instantiatable(void) {};

    // EOFA-specific operations
    virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag);
    virtual void Dtilde (const FermionField& in, FermionField& out);
    virtual void DtildeInv (const FermionField& in, FermionField& out);

    // override multiply; both return norm2 of the result
    virtual RealD M (const FermionField& in, FermionField& out);
    virtual RealD Mdag (const FermionField& in, FermionField& out);

    // half checkerboard operations; the *_shift variants fuse in the
    // EOFA shift term
    virtual void Mooee (const FermionField& in, FermionField& out);
    virtual void MooeeDag (const FermionField& in, FermionField& out);
    virtual void MooeeInv (const FermionField& in, FermionField& out);
    virtual void MooeeInv_shift (const FermionField& in, FermionField& out);
    virtual void MooeeInvDag (const FermionField& in, FermionField& out);
    virtual void MooeeInvDag_shift(const FermionField& in, FermionField& out);

    virtual void M5D (const FermionField& psi, FermionField& chi);
    virtual void M5Ddag (const FermionField& psi, FermionField& chi);

    /////////////////////////////////////////////////////
    // Instantiate different versions depending on Impl
    /////////////////////////////////////////////////////
    void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

    void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
      std::vector<Coeff_t>& shift_coeffs);

    void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

    void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
      std::vector<Coeff_t>& shift_coeffs);

    // Generic and SIMD-assembly backends for the even-even block
    void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);

    void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);

    void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
      Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);

    void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
      Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);

    // Reset shift-dependent coefficients when the shift constant changes
    virtual void RefreshShiftCoefficients(RealD new_shift);

    // Constructors
    MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
      GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
      RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
      RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());

  protected:
    void SetCoefficientsPrecondShiftOps(void);
};
|
||||||
|
}}
|
||||||
|
|
||||||
|
#define INSTANTIATE_DPERP_MOBIUS_EOFA(A)\
|
||||||
|
template void MobiusEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
|
||||||
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
|
||||||
|
template void MobiusEOFAFermion<A>::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
|
||||||
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
|
||||||
|
template void MobiusEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
|
||||||
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
|
||||||
|
template void MobiusEOFAFermion<A>::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
|
||||||
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
|
||||||
|
template void MobiusEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
|
||||||
|
template void MobiusEOFAFermion<A>::MooeeInv_shift(const FermionField& psi, FermionField& chi); \
|
||||||
|
template void MobiusEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi); \
|
||||||
|
template void MobiusEOFAFermion<A>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi);
|
||||||
|
|
||||||
|
#undef MOBIUS_EOFA_DPERP_DENSE
|
||||||
|
#define MOBIUS_EOFA_DPERP_CACHE
|
||||||
|
#undef MOBIUS_EOFA_DPERP_LINALG
|
||||||
|
#define MOBIUS_EOFA_DPERP_VEC
|
||||||
|
|
||||||
|
#endif
|
429
lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
Normal file
429
lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
Normal file
@ -0,0 +1,429 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi,
|
||||||
|
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
GridBase *grid = psi._grid;
|
||||||
|
|
||||||
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
this->M5Dcalls++;
|
||||||
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
auto tmp = psi._odata[0];
|
||||||
|
if(s==0){
|
||||||
|
spProj5m(tmp, psi._odata[ss+s+1]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5p(tmp, psi._odata[ss+Ls-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
} else if(s==(Ls-1)) {
|
||||||
|
spProj5m(tmp, psi._odata[ss+0]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5p(tmp, psi._odata[ss+s-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
} else {
|
||||||
|
spProj5m(tmp, psi._odata[ss+s+1]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5p(tmp, psi._odata[ss+s-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->M5Dtime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
|
||||||
|
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
|
||||||
|
std::vector<Coeff_t> &shift_coeffs)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||||
|
GridBase *grid = psi._grid;
|
||||||
|
|
||||||
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
this->M5Dcalls++;
|
||||||
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
auto tmp = psi._odata[0];
|
||||||
|
if(s==0){
|
||||||
|
spProj5m(tmp, psi._odata[ss+s+1]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5p(tmp, psi._odata[ss+Ls-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
} else if(s==(Ls-1)) {
|
||||||
|
spProj5m(tmp, psi._odata[ss+0]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5p(tmp, psi._odata[ss+s-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
} else {
|
||||||
|
spProj5m(tmp, psi._odata[ss+s+1]);
|
||||||
|
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||||
|
spProj5p(tmp, psi._odata[ss+s-1]);
|
||||||
|
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||||
|
}
|
||||||
|
if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+shift_s]); }
|
||||||
|
else{ spProj5m(tmp, psi._odata[ss+shift_s]); }
|
||||||
|
chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->M5Dtime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Daggered 5th-dimension tridiagonal kernel: identical structure to M5D but
// with the chiral projectors swapped (P_+ on s+1, P_- on s-1).
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi,
  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
{
  int Ls = this->Ls;
  GridBase *grid = psi._grid;

  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard = psi.checkerboard;

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();

  parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
    auto tmp = psi._odata[0];   // scratch half-spinor, overwritten before each use
    for(int s=0; s<Ls; s++){
      if(s==0) {
        spProj5p(tmp, psi._odata[ss+s+1]);
        chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
        spProj5m(tmp, psi._odata[ss+Ls-1]);      // wrap: s-1 -> Ls-1
        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
      } else if(s==(Ls-1)) {
        spProj5p(tmp, psi._odata[ss+0]);          // wrap: s+1 -> 0
        chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
        spProj5m(tmp, psi._odata[ss+s-1]);
        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
      } else {
        spProj5p(tmp, psi._odata[ss+s+1]);
        chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
        spProj5m(tmp, psi._odata[ss+s-1]);
        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
      }
    }
  }

  this->M5Dtime += usecond();
}
|
||||||
|
|
||||||
|
// Daggered M5D with fused EOFA shift term. Because the dagger transposes the
// rank-one shift, the contributions here are *scattered*: every slice s
// contributes shift_coeffs[s]*P_{pm} psi[s] into the single boundary slice
// chi[shift_s], rather than each slice receiving a term as in M5D_shift.
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
  std::vector<Coeff_t> &shift_coeffs)
{
  int Ls = this->Ls;
  int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
  GridBase *grid = psi._grid;

  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard = psi.checkerboard;

  // Flops = 6.0*(Nc*Ns) *Ls*vol
  this->M5Dcalls++;
  this->M5Dtime -= usecond();

  parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
    // Zero the last slice up front: shift contributions accumulate into
    // chi[ss+shift_s] before the tridiagonal part reaches s = Ls-1, so the
    // s == Ls-1 branch below must *add* its diagonal piece rather than
    // assign it.
    chi[ss+Ls-1] = zero;
    auto tmp = psi._odata[0];   // scratch half-spinor, overwritten before each use
    for(int s=0; s<Ls; s++){
      if(s==0) {
        spProj5p(tmp, psi._odata[ss+s+1]);
        chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
        spProj5m(tmp, psi._odata[ss+Ls-1]);
        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
      } else if(s==(Ls-1)) {
        spProj5p(tmp, psi._odata[ss+0]);
        chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;   // accumulate, see note above
        spProj5m(tmp, psi._odata[ss+s-1]);
        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
      } else {
        spProj5p(tmp, psi._odata[ss+s+1]);
        chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
        spProj5m(tmp, psi._odata[ss+s-1]);
        chi[ss+s] = chi[ss+s] + lower[s]*tmp;
      }
      // Scatter this slice's shift contribution into the boundary slice
      if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+s]); }
      else{ spProj5m(tmp, psi._odata[ss+s]); }
      chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
    }
  }

  this->M5Dtime += usecond();
}
|
||||||
|
|
||||||
|
// Inverse of the even-even block via the precomputed LDU decomposition
// (lee/leem lower factors, dee diagonal, uee/ueem upper factors).
// Delegates to MooeeInv_shift when a non-zero EOFA shift is set.
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
{
  if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }

  GridBase *grid = psi._grid;
  int Ls = this->Ls;

  chi.checkerboard = psi.checkerboard;

  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();

  parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){

    auto tmp = psi._odata[0];   // scratch half-spinor, overwritten before each use

    // Apply (L^{\prime})^{-1}: forward substitution down the 5th dimension
    chi[ss] = psi[ss]; // chi[0]=psi[0]
    for(int s=1; s<Ls; s++){
      spProj5p(tmp, chi[ss+s-1]);
      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
    }

    // L_m^{-1}: fold the dense last row into chi[Ls-1]
    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
      spProj5m(tmp, chi[ss+s]);
      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
    }

    // U_m^{-1} D^{-1}: scale by the diagonal and remove the dense last column
    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
      spProj5p(tmp, chi[ss+Ls-1]);
      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
    }
    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];

    // Apply U^{-1}: back substitution up the 5th dimension
    for(int s=Ls-2; s>=0; s--){
      spProj5m(tmp, chi[ss+s+1]);
      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
    }
  }

  this->MooeeInvTime += usecond();
}
|
||||||
|
|
||||||
|
// Inverse of the shifted even-even block, (Mooee + Delta)^{-1}, using the
// same LDU back-substitution as MooeeInv plus a rank-one correction built
// from MooeeInv_shift_lc / MooeeInv_shift_norm (see the coefficient
// construction in SetCoefficientsPrecondShiftOps).
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionField &chi)
{
  GridBase *grid = psi._grid;
  int Ls = this->Ls;

  chi.checkerboard = psi.checkerboard;

  this->MooeeInvCalls++;
  this->MooeeInvTime -= usecond();

  parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){

    auto tmp1 = psi._odata[0];          // scratch for spin projections
    auto tmp2 = psi._odata[0];          // linear-combination accumulator
    auto tmp2_spProj = psi._odata[0];   // P_{pm}-projected accumulator

    // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
    chi[ss] = psi[ss]; // chi[0]=psi[0]
    tmp2 = MooeeInv_shift_lc[0]*psi[ss];
    for(int s=1; s<Ls; s++){
      spProj5p(tmp1, chi[ss+s-1]);
      chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
      tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
    }
    // Project the accumulated linear combination with P_{pm}
    if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
    else{ spProj5m(tmp2_spProj, tmp2); }

    // L_m^{-1}
    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
      spProj5m(tmp1, chi[ss+s]);
      chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
    }

    // U_m^{-1} D^{-1}
    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
      spProj5p(tmp1, chi[ss+Ls-1]);
      chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
    }
    // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
    chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
    // Seed tmp1 with P_- of the (pre-shift) last slice for the first
    // iteration of the back-substitution loop below.
    spProj5m(tmp1, chi[ss+Ls-1]);
    chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;

    // Apply U^{-1} and add shift term.
    // NOTE(review): at each s the projection feeding the next iteration is
    // taken *before* the shift-norm term is added to chi[ss+s] -- the
    // back-substitution deliberately acts on the un-shifted part only;
    // confirm against the coefficient derivation if modifying.
    for(int s=Ls-2; s>=0; s--){
      chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
      spProj5m(tmp1, chi[ss+s]);
      chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
    }
  }

  this->MooeeInvTime += usecond();
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
|
||||||
|
{
|
||||||
|
if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
|
||||||
|
|
||||||
|
GridBase *grid = psi._grid;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
this->MooeeInvCalls++;
|
||||||
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
||||||
|
|
||||||
|
auto tmp = psi._odata[0];
|
||||||
|
|
||||||
|
// Apply (U^{\prime})^{-dag}
|
||||||
|
chi[ss] = psi[ss];
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
spProj5m(tmp, chi[ss+s-1]);
|
||||||
|
chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// U_m^{-\dag}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
spProj5p(tmp, chi[ss+s]);
|
||||||
|
chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-\dag} D^{-dag}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
spProj5m(tmp, chi[ss+Ls-1]);
|
||||||
|
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
|
||||||
|
}
|
||||||
|
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
|
||||||
|
|
||||||
|
// Apply L^{-dag}
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
spProj5p(tmp, chi[ss+s+1]);
|
||||||
|
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->MooeeInvTime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, FermionField &chi)
|
||||||
|
{
|
||||||
|
GridBase *grid = psi._grid;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
this->MooeeInvCalls++;
|
||||||
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
|
||||||
|
|
||||||
|
auto tmp1 = psi._odata[0];
|
||||||
|
auto tmp2 = psi._odata[0];
|
||||||
|
auto tmp2_spProj = psi._odata[0];
|
||||||
|
|
||||||
|
// Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
|
||||||
|
chi[ss] = psi[ss];
|
||||||
|
tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
spProj5m(tmp1, chi[ss+s-1]);
|
||||||
|
chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
|
||||||
|
tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
|
||||||
|
}
|
||||||
|
if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
|
||||||
|
else{ spProj5m(tmp2_spProj, tmp2); }
|
||||||
|
|
||||||
|
// U_m^{-\dag}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
spProj5p(tmp1, chi[ss+s]);
|
||||||
|
chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-\dag} D^{-dag}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
spProj5m(tmp1, chi[ss+Ls-1]);
|
||||||
|
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
|
||||||
|
}
|
||||||
|
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
|
||||||
|
spProj5p(tmp1, chi[ss+Ls-1]);
|
||||||
|
chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
|
||||||
|
|
||||||
|
// Apply L^{-dag}
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
|
||||||
|
spProj5p(tmp1, chi[ss+s]);
|
||||||
|
chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->MooeeInvTime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef MOBIUS_EOFA_DPERP_CACHE
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}}
|
184
lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
Normal file
184
lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/Grid_Eigen_Dense.h>
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Dense matrix versions of routines
|
||||||
|
*/
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int LLs = psi._grid->_rdimensions[0];
|
||||||
|
int vol = psi._grid->oSites()/LLs;
|
||||||
|
|
||||||
|
int pm = this->pm;
|
||||||
|
RealD shift = this->shift;
|
||||||
|
RealD alpha = this->alpha;
|
||||||
|
RealD k = this->k;
|
||||||
|
RealD mq1 = this->mq1;
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
assert(Ls==LLs);
|
||||||
|
|
||||||
|
Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls);
|
||||||
|
Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
|
||||||
|
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
Pplus(s,s) = this->bee[s];
|
||||||
|
Pminus(s,s) = this->bee[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
Pminus(s,s+1) = -this->cee[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
Pplus(s+1,s) = -this->cee[s+1];
|
||||||
|
}
|
||||||
|
Pplus (0,Ls-1) = mq1*this->cee[0];
|
||||||
|
Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
|
||||||
|
|
||||||
|
if(shift != 0.0){
|
||||||
|
Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
|
||||||
|
for(int s=0; s<Ls; ++s){
|
||||||
|
if(pm == 1){ Pplus(s,Ls-1) += shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
|
||||||
|
else{ Pminus(Ls-1-s,Ls-1) -= shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Eigen::MatrixXd PplusMat ;
|
||||||
|
Eigen::MatrixXd PminusMat;
|
||||||
|
|
||||||
|
if(inv){
|
||||||
|
PplusMat = Pplus.inverse();
|
||||||
|
PminusMat = Pminus.inverse();
|
||||||
|
} else {
|
||||||
|
PplusMat = Pplus;
|
||||||
|
PminusMat = Pminus;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(dag){
|
||||||
|
PplusMat.adjointInPlace();
|
||||||
|
PminusMat.adjointInPlace();
|
||||||
|
}
|
||||||
|
|
||||||
|
// For the non-vectorised s-direction this is simple
|
||||||
|
|
||||||
|
for(auto site=0; site<vol; site++){
|
||||||
|
|
||||||
|
SiteSpinor SiteChi;
|
||||||
|
SiteHalfSpinor SitePplus;
|
||||||
|
SiteHalfSpinor SitePminus;
|
||||||
|
|
||||||
|
for(int s1=0; s1<Ls; s1++){
|
||||||
|
SiteChi = zero;
|
||||||
|
for(int s2=0; s2<Ls; s2++){
|
||||||
|
int lex2 = s2 + Ls*site;
|
||||||
|
if(PplusMat(s1,s2) != 0.0){
|
||||||
|
spProj5p(SitePplus,psi[lex2]);
|
||||||
|
accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
|
||||||
|
}
|
||||||
|
if(PminusMat(s1,s2) != 0.0){
|
||||||
|
spProj5m(SitePminus, psi[lex2]);
|
||||||
|
accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
chi[s1+Ls*site] = SiteChi*0.5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef MOBIUS_EOFA_DPERP_DENSE
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
|
||||||
|
|
||||||
|
template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
|
||||||
|
|
||||||
|
template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}}
|
290
lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
Normal file
290
lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
Normal file
@ -0,0 +1,290 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
|
||||||
|
// Pminus fowards
|
||||||
|
// Pplus backwards
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
int Ls = this->Ls;
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
if(s==0) {
|
||||||
|
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
|
||||||
|
} else if (s==(Ls-1)) {
|
||||||
|
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
|
||||||
|
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
} else {
|
||||||
|
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||||
|
std::vector<Coeff_t>& shift_coeffs)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
int Ls = this->Ls;
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
if(s==0) {
|
||||||
|
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
|
||||||
|
} else if (s==(Ls-1)) {
|
||||||
|
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
|
||||||
|
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
} else {
|
||||||
|
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
}
|
||||||
|
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
|
||||||
|
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
int Ls = this->Ls;
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
if(s==0) {
|
||||||
|
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
|
||||||
|
} else if (s==(Ls-1)) {
|
||||||
|
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
|
||||||
|
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
} else {
|
||||||
|
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||||
|
std::vector<Coeff_t>& shift_coeffs)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
int Ls = this->Ls;
|
||||||
|
for(int s=0; s<Ls; s++){
|
||||||
|
if(s==0) {
|
||||||
|
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
|
||||||
|
} else if (s==(Ls-1)) {
|
||||||
|
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
|
||||||
|
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
} else {
|
||||||
|
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
|
||||||
|
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
|
||||||
|
}
|
||||||
|
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
|
||||||
|
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
|
||||||
|
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
Coeff_t czero(0.0);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
// Apply (L^{\prime})^{-1}
|
||||||
|
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-1}
|
||||||
|
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
|
||||||
|
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// U_m^{-1} D^{-1}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
|
||||||
|
}
|
||||||
|
axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
|
||||||
|
|
||||||
|
// Apply U^{-1}
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[Ls]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
Coeff_t czero(0.0);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
FermionField tmp(psi._grid);
|
||||||
|
|
||||||
|
// Apply (L^{\prime})^{-1}
|
||||||
|
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
|
||||||
|
axpby_ssp(tmp, czero, tmp, this->MooeeInv_shift_lc[0], psi, 0, 0);
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
|
||||||
|
axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-1}
|
||||||
|
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
|
||||||
|
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// U_m^{-1} D^{-1}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
|
||||||
|
}
|
||||||
|
axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
|
||||||
|
|
||||||
|
// Apply U^{-1} and add shift term
|
||||||
|
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
|
||||||
|
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[Ls]
|
||||||
|
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
|
||||||
|
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
|
||||||
|
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
Coeff_t czero(0.0);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
// Apply (U^{\prime})^{-dagger}
|
||||||
|
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// U_m^{-\dagger}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-\dagger} D^{-dagger}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
|
||||||
|
}
|
||||||
|
axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
|
||||||
|
|
||||||
|
// Apply L^{-dagger}
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[Ls]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
Coeff_t one(1.0);
|
||||||
|
Coeff_t czero(0.0);
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
FermionField tmp(psi._grid);
|
||||||
|
|
||||||
|
// Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
|
||||||
|
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
|
||||||
|
axpby_ssp(tmp, czero, tmp, this->MooeeInvDag_shift_lc[0], psi, 0, 0);
|
||||||
|
for(int s=1; s<Ls; s++){
|
||||||
|
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
|
||||||
|
axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// U_m^{-\dagger}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// L_m^{-\dagger} D^{-dagger}
|
||||||
|
for(int s=0; s<Ls-1; s++){
|
||||||
|
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
|
||||||
|
}
|
||||||
|
axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
|
||||||
|
|
||||||
|
// Apply L^{-dagger} and add shift
|
||||||
|
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
|
||||||
|
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
|
||||||
|
for(int s=Ls-2; s>=0; s--){
|
||||||
|
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[Ls]
|
||||||
|
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
|
||||||
|
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef MOBIUS_EOFA_DPERP_LINALG
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}}
|
983
lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
Normal file
983
lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
Normal file
@ -0,0 +1,983 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Dense matrix versions of routines
|
||||||
|
*/
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
|
||||||
|
{
|
||||||
|
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
|
||||||
|
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
|
{
|
||||||
|
GridBase* grid = psi._grid;
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int LLs = grid->_rdimensions[0];
|
||||||
|
const int nsimd = Simd::Nsimd();
|
||||||
|
|
||||||
|
Vector<iSinglet<Simd>> u(LLs);
|
||||||
|
Vector<iSinglet<Simd>> l(LLs);
|
||||||
|
Vector<iSinglet<Simd>> d(LLs);
|
||||||
|
|
||||||
|
assert(Ls/LLs == nsimd);
|
||||||
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
// just directly address via type pun
|
||||||
|
typedef typename Simd::scalar_type scalar_type;
|
||||||
|
scalar_type* u_p = (scalar_type*) &u[0];
|
||||||
|
scalar_type* l_p = (scalar_type*) &l[0];
|
||||||
|
scalar_type* d_p = (scalar_type*) &d[0];
|
||||||
|
|
||||||
|
for(int o=0; o<LLs; o++){ // outer
|
||||||
|
for(int i=0; i<nsimd; i++){ //inner
|
||||||
|
int s = o + i*LLs;
|
||||||
|
int ss = o*nsimd + i;
|
||||||
|
u_p[ss] = upper[s];
|
||||||
|
l_p[ss] = lower[s];
|
||||||
|
d_p[ss] = diag[s];
|
||||||
|
}}
|
||||||
|
|
||||||
|
this->M5Dcalls++;
|
||||||
|
this->M5Dtime -= usecond();
|
||||||
|
|
||||||
|
assert(Nc == 3);
|
||||||
|
|
||||||
|
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
|
||||||
|
alignas(64) SiteHalfSpinor hp;
|
||||||
|
alignas(64) SiteHalfSpinor hm;
|
||||||
|
alignas(64) SiteSpinor fp;
|
||||||
|
alignas(64) SiteSpinor fm;
|
||||||
|
|
||||||
|
for(int v=0; v<LLs; v++){
|
||||||
|
|
||||||
|
int vp = (v+1)%LLs;
|
||||||
|
int vm = (v+LLs-1)%LLs;
|
||||||
|
|
||||||
|
spProj5m(hp, psi[ss+vp]);
|
||||||
|
spProj5p(hm, psi[ss+vm]);
|
||||||
|
|
||||||
|
if (vp <= v){ rotate(hp, hp, 1); }
|
||||||
|
if (vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||||
|
|
||||||
|
hp = 0.5*hp;
|
||||||
|
hm = 0.5*hm;
|
||||||
|
|
||||||
|
spRecon5m(fp, hp);
|
||||||
|
spRecon5p(fm, hm);
|
||||||
|
|
||||||
|
chi[ss+v] = d[v]*phi[ss+v];
|
||||||
|
chi[ss+v] = chi[ss+v] + u[v]*fp;
|
||||||
|
chi[ss+v] = chi[ss+v] + l[v]*fm;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
for(int v=0; v<LLs; v++){
|
||||||
|
|
||||||
|
vprefetch(psi[ss+v+LLs]);
|
||||||
|
|
||||||
|
int vp = (v == LLs-1) ? 0 : v+1;
|
||||||
|
int vm = (v == 0) ? LLs-1 : v-1;
|
||||||
|
|
||||||
|
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||||
|
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||||
|
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||||
|
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||||
|
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||||
|
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||||
|
|
||||||
|
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||||
|
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||||
|
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||||
|
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||||
|
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||||
|
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||||
|
|
||||||
|
if(vp <= v){
|
||||||
|
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||||
|
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||||
|
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||||
|
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||||
|
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||||
|
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(vm >= v){
|
||||||
|
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||||
|
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||||
|
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||||
|
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||||
|
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||||
|
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Can force these to real arithmetic and save 2x.
|
||||||
|
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||||
|
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||||
|
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||||
|
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||||
|
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||||
|
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||||
|
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||||
|
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||||
|
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||||
|
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||||
|
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||||
|
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||||
|
|
||||||
|
vstream(chi[ss+v]()(0)(0), p_00);
|
||||||
|
vstream(chi[ss+v]()(0)(1), p_01);
|
||||||
|
vstream(chi[ss+v]()(0)(2), p_02);
|
||||||
|
vstream(chi[ss+v]()(1)(0), p_10);
|
||||||
|
vstream(chi[ss+v]()(1)(1), p_11);
|
||||||
|
vstream(chi[ss+v]()(1)(2), p_12);
|
||||||
|
vstream(chi[ss+v]()(2)(0), p_20);
|
||||||
|
vstream(chi[ss+v]()(2)(1), p_21);
|
||||||
|
vstream(chi[ss+v]()(2)(2), p_22);
|
||||||
|
vstream(chi[ss+v]()(3)(0), p_30);
|
||||||
|
vstream(chi[ss+v]()(3)(1), p_31);
|
||||||
|
vstream(chi[ss+v]()(3)(2), p_32);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
this->M5Dtime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
// M5D_shift: applies the 5th-dimension tridiagonal operator (diag/upper/lower
// hopping in s) plus the EOFA "shift" term, vectorised over SIMD lanes laid
// out along the s (5th) dimension. The shift couples every s-slice to a single
// fixed slice selected by this->pm (s = Ls-1 for pm == +1, s = 0 for pm == -1),
// adding shift_coeffs[s] times the chirally-projected field from that slice.
// NOTE(review): pm == +1 adds the shift to spin components 2..3 and pm == -1
// to components 0..1 here — presumably the P+/P- projections; confirm against
// the scalar reference in the #if 0 branch (axpby_ssp_pplus/pminus).
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
  FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
  std::vector<Coeff_t>& shift_coeffs)
{
#if 0
  // Scalar reference implementation: plain M5D followed by per-slice axpby
  // of the shift column. Kept for documentation/debugging of the SIMD path.
  this->M5D(psi, phi, chi, lower, diag, upper);

  // FIXME: possible gain from vectorizing shift operation as well?
  Coeff_t one(1.0);
  int Ls = this->Ls;
  for(int s=0; s<Ls; s++){
    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
  }

#else

  GridBase* grid = psi._grid;
  int Ls = this->Ls;
  // LLs = local (per-SIMD-vector) extent of the s dimension; the full Ls is
  // split as Ls = LLs * nsimd, with nsimd s-slices packed into each vector.
  int LLs = grid->_rdimensions[0];
  const int nsimd = Simd::Nsimd();

  // Per-s coefficient vectors, one SIMD singlet per local slice:
  // u = upper diagonal, l = lower diagonal, d = diagonal, s = shift.
  Vector<iSinglet<Simd>> u(LLs);
  Vector<iSinglet<Simd>> l(LLs);
  Vector<iSinglet<Simd>> d(LLs);
  Vector<iSinglet<Simd>> s(LLs);

  assert(Ls/LLs == nsimd);
  assert(phi.checkerboard == psi.checkerboard);

  chi.checkerboard = psi.checkerboard;

  // just directly address via type pun
  typedef typename Simd::scalar_type scalar_type;
  scalar_type* u_p = (scalar_type*) &u[0];
  scalar_type* l_p = (scalar_type*) &l[0];
  scalar_type* d_p = (scalar_type*) &d[0];
  scalar_type* s_p = (scalar_type*) &s[0];

  // Pack the std::vector<Coeff_t> coefficients into SIMD lanes: global slice
  // s = o + i*LLs lands in lane i of vector element o.
  for(int o=0; o<LLs; o++){ // outer
  for(int i=0; i<nsimd; i++){ //inner
    int s = o + i*LLs;
    int ss = o*nsimd + i;
    u_p[ss] = upper[s];
    l_p[ss] = lower[s];
    d_p[ss] = diag[s];
    s_p[ss] = shift_coeffs[s];
  }}

  // Instrumentation counters shared with M5D.
  this->M5Dcalls++;
  this->M5Dtime -= usecond();

  // The unrolled colour indices below hard-code three colours.
  assert(Nc == 3);

  parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs

    // Source slice for the shift term and its projected spin components
    // (2..3 for pm == +1, 0..1 for pm == -1), loaded once per 4d site.
    int vs = (this->pm == 1) ? LLs-1 : 0;
    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);

    for(int v=0; v<LLs; v++){

      vprefetch(psi[ss+v+LLs]);

      // Periodic neighbours in the local s direction.
      int vp = (v == LLs-1) ? 0 : v+1;
      int vm = (v == 0) ? LLs-1 : v-1;

      // hp: spin components 2..3 of the s+1 neighbour (upper-diagonal term).
      Simd hp_00 = psi[ss+vp]()(2)(0);
      Simd hp_01 = psi[ss+vp]()(2)(1);
      Simd hp_02 = psi[ss+vp]()(2)(2);
      Simd hp_10 = psi[ss+vp]()(3)(0);
      Simd hp_11 = psi[ss+vp]()(3)(1);
      Simd hp_12 = psi[ss+vp]()(3)(2);

      // hm: spin components 0..1 of the s-1 neighbour (lower-diagonal term).
      Simd hm_00 = psi[ss+vm]()(0)(0);
      Simd hm_01 = psi[ss+vm]()(0)(1);
      Simd hm_02 = psi[ss+vm]()(0)(2);
      Simd hm_10 = psi[ss+vm]()(1)(0);
      Simd hm_11 = psi[ss+vm]()(1)(1);
      Simd hm_12 = psi[ss+vm]()(1)(2);

      // When the neighbour index wraps within the vector, shift SIMD lanes
      // by one complex number (tRotate<2> = 2 real elements).
      if(vp <= v){
        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
      }

      // The shift source lives at a fixed slice; rotate its lanes forward
      // once we pass it (pm == +1 case; vs == LLs-1 so this fires at the top).
      if(this->pm == 1 && vs <= v){
        hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
        hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
        hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
        hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
        hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
        hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
      }

      // Backward wrap: rotate by one complex number in the other direction
      // (equivalent to rotate(hm, hm, nsimd-1) in the scalar reference).
      if(vm >= v){
        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
      }

      // pm == -1 case: shift source is slice 0 (vs == 0), rotated backward.
      if(this->pm == -1 && vs >= v){
        hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
        hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
        hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
        hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
        hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
        hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
      }

      // Result per spin/colour: diag*phi + (l*hm or u*hp), with the shift
      // contribution added to components 0..1 only for pm == -1 and to
      // components 2..3 only for pm == +1.
      // Can force these to real arithmetic and save 2x.
      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);

      // Non-temporal stores of the result.
      vstream(chi[ss+v]()(0)(0), p_00);
      vstream(chi[ss+v]()(0)(1), p_01);
      vstream(chi[ss+v]()(0)(2), p_02);
      vstream(chi[ss+v]()(1)(0), p_10);
      vstream(chi[ss+v]()(1)(1), p_11);
      vstream(chi[ss+v]()(1)(2), p_12);
      vstream(chi[ss+v]()(2)(0), p_20);
      vstream(chi[ss+v]()(2)(1), p_21);
      vstream(chi[ss+v]()(2)(2), p_22);
      vstream(chi[ss+v]()(3)(0), p_30);
      vstream(chi[ss+v]()(3)(1), p_31);
      vstream(chi[ss+v]()(3)(2), p_32);
    }
  }

  this->M5Dtime += usecond();

#endif
}
|
||||||
|
|
||||||
|
// M5Ddag: adjoint of the 5th-dimension tridiagonal operator. Relative to M5D,
// the roles of the chiral projections are swapped: the s+1 neighbour
// contributes through spin components 0..1 (scaled by upper[]) and the s-1
// neighbour through components 2..3 (scaled by lower[]). SIMD layout and
// lane-rotation logic are identical to M5D.
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
  FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
  GridBase* grid = psi._grid;
  int Ls = this->Ls;
  // Local s extent; the full Ls is packed as Ls = LLs * nsimd SIMD lanes.
  int LLs = grid->_rdimensions[0];
  int nsimd = Simd::Nsimd();

  // Per-slice coefficient vectors (upper/lower/diagonal).
  Vector<iSinglet<Simd>> u(LLs);
  Vector<iSinglet<Simd>> l(LLs);
  Vector<iSinglet<Simd>> d(LLs);

  assert(Ls/LLs == nsimd);
  assert(phi.checkerboard == psi.checkerboard);

  chi.checkerboard = psi.checkerboard;

  // just directly address via type pun
  typedef typename Simd::scalar_type scalar_type;
  scalar_type* u_p = (scalar_type*) &u[0];
  scalar_type* l_p = (scalar_type*) &l[0];
  scalar_type* d_p = (scalar_type*) &d[0];

  // Pack scalar coefficients into SIMD lanes: slice s = o + i*LLs -> lane i
  // of vector element o.
  for(int o=0; o<LLs; o++){ // outer
  for(int i=0; i<nsimd; i++){ //inner
    int s = o + i*LLs;
    int ss = o*nsimd + i;
    u_p[ss] = upper[s];
    l_p[ss] = lower[s];
    d_p[ss] = diag[s];
  }}

  // Instrumentation counters shared with M5D.
  this->M5Dcalls++;
  this->M5Dtime -= usecond();

  parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs

#if 0
    // Scalar reference path kept for documentation/debugging.
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
    alignas(64) SiteSpinor fm;

    for(int v=0; v<LLs; v++){

      int vp = (v+1)%LLs;
      int vm = (v+LLs-1)%LLs;

      // Dagger: P+ projection of the s+1 neighbour, P- of the s-1 neighbour.
      spProj5p(hp, psi[ss+vp]);
      spProj5m(hm, psi[ss+vm]);

      if(vp <= v){ rotate(hp, hp, 1); }
      if(vm >= v){ rotate(hm, hm, nsimd-1); }

      hp = hp*0.5;
      hm = hm*0.5;
      spRecon5p(fp, hp);
      spRecon5m(fm, hm);

      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
      chi[ss+v] = chi[ss+v] +l[v]*fm;

    }

#else

    for(int v=0; v<LLs; v++){

      vprefetch(psi[ss+v+LLs]);

      // Periodic neighbours in the local s direction.
      int vp = (v == LLs-1) ? 0 : v+1;
      int vm = (v == 0 ) ? LLs-1 : v-1;

      // hp: spin components 0..1 of the s+1 neighbour (note: swapped
      // relative to M5D, as required by the adjoint).
      Simd hp_00 = psi[ss+vp]()(0)(0);
      Simd hp_01 = psi[ss+vp]()(0)(1);
      Simd hp_02 = psi[ss+vp]()(0)(2);
      Simd hp_10 = psi[ss+vp]()(1)(0);
      Simd hp_11 = psi[ss+vp]()(1)(1);
      Simd hp_12 = psi[ss+vp]()(1)(2);

      // hm: spin components 2..3 of the s-1 neighbour.
      Simd hm_00 = psi[ss+vm]()(2)(0);
      Simd hm_01 = psi[ss+vm]()(2)(1);
      Simd hm_02 = psi[ss+vm]()(2)(2);
      Simd hm_10 = psi[ss+vm]()(3)(0);
      Simd hm_11 = psi[ss+vm]()(3)(1);
      Simd hm_12 = psi[ss+vm]()(3)(2);

      // Lane wrap-around: rotate by one complex number (2 reals) when the
      // neighbour index wraps within the SIMD vector.
      if (vp <= v){
        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
      }

      // Backward wrap: rotate one complex number the other way
      // (equivalent to rotate(hm, hm, nsimd-1) above).
      if(vm >= v){
        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
      }

      // Result: diag*phi plus u*hp on components 0..1 and l*hm on 2..3.
      Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
      Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
      Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
      Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
      Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
      Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
      Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
      Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
      Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
      Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
      Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
      Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);

      // Non-temporal stores of the result.
      vstream(chi[ss+v]()(0)(0), p_00);
      vstream(chi[ss+v]()(0)(1), p_01);
      vstream(chi[ss+v]()(0)(2), p_02);
      vstream(chi[ss+v]()(1)(0), p_10);
      vstream(chi[ss+v]()(1)(1), p_11);
      vstream(chi[ss+v]()(1)(2), p_12);
      vstream(chi[ss+v]()(2)(0), p_20);
      vstream(chi[ss+v]()(2)(1), p_21);
      vstream(chi[ss+v]()(2)(2), p_22);
      vstream(chi[ss+v]()(3)(0), p_30);
      vstream(chi[ss+v]()(3)(1), p_31);
      vstream(chi[ss+v]()(3)(2), p_32);

    }

#endif

  }

  this->M5Dtime += usecond();
}
|
||||||
|
|
||||||
|
// M5Ddag_shift: adjoint of M5D_shift — the 5th-dimension tridiagonal adjoint
// operator plus the transposed EOFA shift term. Relative to M5D_shift the
// shift is added to the opposite chirality: components 0..1 for pm == +1 and
// components 2..3 for pm == -1 (compare the axpby_ssp calls in the #if 0
// branch, whose (s, s') slice arguments are transposed w.r.t. M5D_shift).
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
  FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
  std::vector<Coeff_t>& shift_coeffs)
{
#if 0
  // Scalar reference implementation: plain M5Ddag plus per-slice axpby of the
  // transposed shift row. Kept for documentation/debugging of the SIMD path.
  this->M5Ddag(psi, phi, chi, lower, diag, upper);

  // FIXME: possible gain from vectorizing shift operation as well?
  Coeff_t one(1.0);
  int Ls = this->Ls;
  for(int s=0; s<Ls; s++){
    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
    else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
  }

#else

  GridBase* grid = psi._grid;
  int Ls = this->Ls;
  // Local s extent; full Ls = LLs * nsimd SIMD lanes.
  int LLs = grid->_rdimensions[0];
  int nsimd = Simd::Nsimd();

  // Per-slice coefficient vectors: upper/lower/diagonal plus shift.
  Vector<iSinglet<Simd>> u(LLs);
  Vector<iSinglet<Simd>> l(LLs);
  Vector<iSinglet<Simd>> d(LLs);
  Vector<iSinglet<Simd>> s(LLs);

  assert(Ls/LLs == nsimd);
  assert(phi.checkerboard == psi.checkerboard);

  chi.checkerboard = psi.checkerboard;

  // just directly address via type pun
  typedef typename Simd::scalar_type scalar_type;
  scalar_type* u_p = (scalar_type*) &u[0];
  scalar_type* l_p = (scalar_type*) &l[0];
  scalar_type* d_p = (scalar_type*) &d[0];
  scalar_type* s_p = (scalar_type*) &s[0];

  // Pack scalar coefficients into SIMD lanes: slice s = o + i*LLs -> lane i
  // of vector element o.
  for(int o=0; o<LLs; o++){ // outer
  for(int i=0; i<nsimd; i++){ //inner
    int s = o + i*LLs;
    int ss = o*nsimd + i;
    u_p[ss] = upper[s];
    l_p[ss] = lower[s];
    d_p[ss] = diag[s];
    s_p[ss] = shift_coeffs[s];
  }}

  // Instrumentation counters shared with M5D.
  this->M5Dcalls++;
  this->M5Dtime -= usecond();

  parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs

    // Source slice for the shift term, loaded once per 4d site; components
    // 0..1 for pm == +1, 2..3 for pm == -1 (swapped relative to M5D_shift).
    int vs = (this->pm == 1) ? LLs-1 : 0;
    Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
    Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
    Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
    Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
    Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
    Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);

    for(int v=0; v<LLs; v++){

      vprefetch(psi[ss+v+LLs]);

      // Periodic neighbours in the local s direction.
      int vp = (v == LLs-1) ? 0 : v+1;
      int vm = (v == 0 ) ? LLs-1 : v-1;

      // hp: spin components 0..1 of the s+1 neighbour (adjoint convention,
      // as in M5Ddag).
      Simd hp_00 = psi[ss+vp]()(0)(0);
      Simd hp_01 = psi[ss+vp]()(0)(1);
      Simd hp_02 = psi[ss+vp]()(0)(2);
      Simd hp_10 = psi[ss+vp]()(1)(0);
      Simd hp_11 = psi[ss+vp]()(1)(1);
      Simd hp_12 = psi[ss+vp]()(1)(2);

      // hm: spin components 2..3 of the s-1 neighbour.
      Simd hm_00 = psi[ss+vm]()(2)(0);
      Simd hm_01 = psi[ss+vm]()(2)(1);
      Simd hm_02 = psi[ss+vm]()(2)(2);
      Simd hm_10 = psi[ss+vm]()(3)(0);
      Simd hm_11 = psi[ss+vm]()(3)(1);
      Simd hm_12 = psi[ss+vm]()(3)(2);

      // Lane wrap-around: rotate by one complex number (tRotate<2>) when the
      // forward neighbour wraps within the SIMD vector.
      if (vp <= v){
        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
      }

      // Shift source lane rotation for pm == +1 (vs == LLs-1).
      if(this->pm == 1 && vs <= v){
        hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
        hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
        hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
        hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
        hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
        hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
      }

      // Backward wrap: rotate one complex number in the other direction.
      if(vm >= v){
        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
      }

      // Shift source lane rotation for pm == -1 (vs == 0).
      if(this->pm == -1 && vs >= v){
        hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
        hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
        hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
        hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
        hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
        hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
      }

      // Result: diag*phi + u*hp (components 0..1) or l*hm (components 2..3),
      // with the shift added to 0..1 for pm == +1 and to 2..3 for pm == -1.
      Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
      Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
      Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
      Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
      Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
      Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
      Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
      Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
      Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
      Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
      Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
      Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
                                  : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
                                      + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);

      // Non-temporal stores of the result.
      vstream(chi[ss+v]()(0)(0), p_00);
      vstream(chi[ss+v]()(0)(1), p_01);
      vstream(chi[ss+v]()(0)(2), p_02);
      vstream(chi[ss+v]()(1)(0), p_10);
      vstream(chi[ss+v]()(1)(1), p_11);
      vstream(chi[ss+v]()(1)(2), p_12);
      vstream(chi[ss+v]()(2)(0), p_20);
      vstream(chi[ss+v]()(2)(1), p_21);
      vstream(chi[ss+v]()(2)(2), p_22);
      vstream(chi[ss+v]()(3)(0), p_30);
      vstream(chi[ss+v]()(3)(1), p_31);
      vstream(chi[ss+v]()(3)(2), p_32);

    }

  }

  this->M5Dtime += usecond();

#endif
}
|
||||||
|
|
||||||
|
#ifdef AVX512
|
||||||
|
#include<simd/Intel512common.h>
|
||||||
|
#include<simd/Intel512avx.h>
|
||||||
|
#include<simd/Intel512single.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
|
||||||
|
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||||
|
{
|
||||||
|
#ifndef AVX512
|
||||||
|
{
|
||||||
|
SiteHalfSpinor BcastP;
|
||||||
|
SiteHalfSpinor BcastM;
|
||||||
|
SiteHalfSpinor SiteChiP;
|
||||||
|
SiteHalfSpinor SiteChiM;
|
||||||
|
|
||||||
|
// Ls*Ls * 2 * 12 * vol flops
|
||||||
|
for(int s1=0; s1<LLs; s1++){
|
||||||
|
|
||||||
|
for(int s2=0; s2<LLs; s2++){
|
||||||
|
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
|
||||||
|
|
||||||
|
int s = s2 + l*LLs;
|
||||||
|
int lex = s2 + LLs*site;
|
||||||
|
|
||||||
|
if( s2==0 && l==0 ){
|
||||||
|
SiteChiP=zero;
|
||||||
|
SiteChiM=zero;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int sp=0; sp<2; sp++){
|
||||||
|
for(int co=0; co<Nc; co++){
|
||||||
|
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
|
||||||
|
}}
|
||||||
|
|
||||||
|
for(int sp=0; sp<2; sp++){
|
||||||
|
for(int co=0; co<Nc; co++){
|
||||||
|
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
|
||||||
|
}}
|
||||||
|
|
||||||
|
for(int sp=0; sp<2; sp++){
|
||||||
|
for(int co=0; co<Nc; co++){
|
||||||
|
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
|
||||||
|
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
|
||||||
|
{
|
||||||
|
int lex = s1 + LLs*site;
|
||||||
|
for(int sp=0; sp<2; sp++){
|
||||||
|
for(int co=0; co<Nc; co++){
|
||||||
|
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||||
|
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
// pointers
|
||||||
|
// MASK_REGS;
|
||||||
|
#define Chi_00 %%zmm1
|
||||||
|
#define Chi_01 %%zmm2
|
||||||
|
#define Chi_02 %%zmm3
|
||||||
|
#define Chi_10 %%zmm4
|
||||||
|
#define Chi_11 %%zmm5
|
||||||
|
#define Chi_12 %%zmm6
|
||||||
|
#define Chi_20 %%zmm7
|
||||||
|
#define Chi_21 %%zmm8
|
||||||
|
#define Chi_22 %%zmm9
|
||||||
|
#define Chi_30 %%zmm10
|
||||||
|
#define Chi_31 %%zmm11
|
||||||
|
#define Chi_32 %%zmm12
|
||||||
|
|
||||||
|
#define BCAST0 %%zmm13
|
||||||
|
#define BCAST1 %%zmm14
|
||||||
|
#define BCAST2 %%zmm15
|
||||||
|
#define BCAST3 %%zmm16
|
||||||
|
#define BCAST4 %%zmm17
|
||||||
|
#define BCAST5 %%zmm18
|
||||||
|
#define BCAST6 %%zmm19
|
||||||
|
#define BCAST7 %%zmm20
|
||||||
|
#define BCAST8 %%zmm21
|
||||||
|
#define BCAST9 %%zmm22
|
||||||
|
#define BCAST10 %%zmm23
|
||||||
|
#define BCAST11 %%zmm24
|
||||||
|
|
||||||
|
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
|
||||||
|
|
||||||
|
for(int s1=0; s1<LLs; s1++){
|
||||||
|
|
||||||
|
for(int s2=0; s2<LLs; s2++){
|
||||||
|
|
||||||
|
int lex = s2 + LLs*site;
|
||||||
|
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
|
||||||
|
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
|
||||||
|
uint64_t a2 = (uint64_t) &psi[lex];
|
||||||
|
|
||||||
|
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
|
||||||
|
|
||||||
|
if((s2+l)==0) {
|
||||||
|
asm(
|
||||||
|
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||||
|
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||||
|
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||||
|
VBCASTCDUP(0,%2,BCAST0)
|
||||||
|
VBCASTCDUP(1,%2,BCAST1)
|
||||||
|
VBCASTCDUP(2,%2,BCAST2)
|
||||||
|
VBCASTCDUP(3,%2,BCAST3)
|
||||||
|
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
|
||||||
|
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
|
||||||
|
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
|
||||||
|
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
|
||||||
|
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
|
||||||
|
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
|
||||||
|
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
|
||||||
|
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
|
||||||
|
VMULMEM(0,%1,BCAST8,Chi_22)
|
||||||
|
VMULMEM(0,%1,BCAST9,Chi_30)
|
||||||
|
VMULMEM(0,%1,BCAST10,Chi_31)
|
||||||
|
VMULMEM(0,%1,BCAST11,Chi_32)
|
||||||
|
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||||
|
} else {
|
||||||
|
asm(
|
||||||
|
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
|
||||||
|
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
|
||||||
|
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
|
||||||
|
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
|
||||||
|
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
|
||||||
|
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
|
||||||
|
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
|
||||||
|
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
|
||||||
|
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
|
||||||
|
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
|
||||||
|
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
|
||||||
|
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
|
||||||
|
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||||
|
}
|
||||||
|
|
||||||
|
a0 = a0 + incr;
|
||||||
|
a1 = a1 + incr;
|
||||||
|
a2 = a2 + sizeof(Simd::scalar_type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
int lexa = s1+LLs*site;
|
||||||
|
asm (
|
||||||
|
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||||
|
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||||
|
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||||
|
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||||
|
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef Chi_00
|
||||||
|
#undef Chi_01
|
||||||
|
#undef Chi_02
|
||||||
|
#undef Chi_10
|
||||||
|
#undef Chi_11
|
||||||
|
#undef Chi_12
|
||||||
|
#undef Chi_20
|
||||||
|
#undef Chi_21
|
||||||
|
#undef Chi_22
|
||||||
|
#undef Chi_30
|
||||||
|
#undef Chi_31
|
||||||
|
#undef Chi_32
|
||||||
|
|
||||||
|
#undef BCAST0
|
||||||
|
#undef BCAST1
|
||||||
|
#undef BCAST2
|
||||||
|
#undef BCAST3
|
||||||
|
#undef BCAST4
|
||||||
|
#undef BCAST5
|
||||||
|
#undef BCAST6
|
||||||
|
#undef BCAST7
|
||||||
|
#undef BCAST8
|
||||||
|
#undef BCAST9
|
||||||
|
#undef BCAST10
|
||||||
|
#undef BCAST11
|
||||||
|
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
// Z-mobius version
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
|
||||||
|
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||||
|
{
|
||||||
|
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
|
||||||
|
exit(-1);
|
||||||
|
};
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
|
||||||
|
{
|
||||||
|
int Ls = this->Ls;
|
||||||
|
int LLs = psi._grid->_rdimensions[0];
|
||||||
|
int vol = psi._grid->oSites()/LLs;
|
||||||
|
|
||||||
|
chi.checkerboard = psi.checkerboard;
|
||||||
|
|
||||||
|
Vector<iSinglet<Simd>> Matp;
|
||||||
|
Vector<iSinglet<Simd>> Matm;
|
||||||
|
Vector<iSinglet<Simd>>* _Matp;
|
||||||
|
Vector<iSinglet<Simd>>* _Matm;
|
||||||
|
|
||||||
|
// MooeeInternalCompute(dag,inv,Matp,Matm);
|
||||||
|
if(inv && dag){
|
||||||
|
_Matp = &this->MatpInvDag;
|
||||||
|
_Matm = &this->MatmInvDag;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(inv && (!dag)){
|
||||||
|
_Matp = &this->MatpInv;
|
||||||
|
_Matm = &this->MatmInv;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!inv){
|
||||||
|
MooeeInternalCompute(dag, inv, Matp, Matm);
|
||||||
|
_Matp = &Matp;
|
||||||
|
_Matm = &Matm;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(_Matp->size() == Ls*LLs);
|
||||||
|
|
||||||
|
this->MooeeInvCalls++;
|
||||||
|
this->MooeeInvTime -= usecond();
|
||||||
|
|
||||||
|
if(switcheroo<Coeff_t>::iscomplex()){
|
||||||
|
parallel_for(auto site=0; site<vol; site++){
|
||||||
|
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
parallel_for(auto site=0; site<vol; site++){
|
||||||
|
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this->MooeeInvTime += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef MOBIUS_EOFA_DPERP_VEC
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
|
||||||
|
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
|
||||||
|
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
|
||||||
|
|
||||||
|
template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
|
||||||
|
template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}}
|
@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
|
|||||||
template<class vobj,class cobj>
|
template<class vobj,class cobj>
|
||||||
class WilsonStencil : public CartesianStencil<vobj,cobj> {
|
class WilsonStencil : public CartesianStencil<vobj,cobj> {
|
||||||
public:
|
public:
|
||||||
|
double timer0;
|
||||||
|
double timer1;
|
||||||
|
double timer2;
|
||||||
|
double timer3;
|
||||||
|
double timer4;
|
||||||
|
double timer5;
|
||||||
|
double timer6;
|
||||||
|
uint64_t callsi;
|
||||||
|
void ZeroCountersi(void)
|
||||||
|
{
|
||||||
|
timer0=0;
|
||||||
|
timer1=0;
|
||||||
|
timer2=0;
|
||||||
|
timer3=0;
|
||||||
|
timer4=0;
|
||||||
|
timer5=0;
|
||||||
|
timer6=0;
|
||||||
|
callsi=0;
|
||||||
|
}
|
||||||
|
void Reporti(int calls)
|
||||||
|
{
|
||||||
|
if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
|
||||||
|
if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate) " <<timer1/calls <<std::endl;
|
||||||
|
if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge ) " <<timer2/calls <<std::endl;
|
||||||
|
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
|
||||||
|
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
|
||||||
|
}
|
||||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||||
|
|
||||||
std::vector<int> same_node;
|
std::vector<int> same_node;
|
||||||
@ -252,6 +278,7 @@ public:
|
|||||||
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
||||||
same_node(npoints)
|
same_node(npoints)
|
||||||
{
|
{
|
||||||
|
ZeroCountersi();
|
||||||
surface_list.resize(0);
|
surface_list.resize(0);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -261,7 +288,6 @@ public:
|
|||||||
// Here we know the distance is 1 for WilsonStencil
|
// Here we know the distance is 1 for WilsonStencil
|
||||||
for(int point=0;point<this->_npoints;point++){
|
for(int point=0;point<this->_npoints;point++){
|
||||||
same_node[point] = this->SameNode(point);
|
same_node[point] = this->SameNode(point);
|
||||||
// std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int site = 0 ;site< vol4;site++){
|
for(int site = 0 ;site< vol4;site++){
|
||||||
@ -282,17 +308,28 @@ public:
|
|||||||
{
|
{
|
||||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||||
this->HaloExchangeOptGather(source,compress);
|
this->HaloExchangeOptGather(source,compress);
|
||||||
this->CommunicateBegin(reqs);
|
double t1=usecond();
|
||||||
this->CommunicateComplete(reqs);
|
// Asynchronous MPI calls multidirectional, Isend etc...
|
||||||
|
// this->CommunicateBegin(reqs);
|
||||||
|
// this->CommunicateComplete(reqs);
|
||||||
|
// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
|
||||||
|
this->Communicate();
|
||||||
|
double t2=usecond(); timer1 += t2-t1;
|
||||||
this->CommsMerge(compress);
|
this->CommsMerge(compress);
|
||||||
|
double t3=usecond(); timer2 += t3-t2;
|
||||||
this->CommsMergeSHM(compress);
|
this->CommsMergeSHM(compress);
|
||||||
|
double t4=usecond(); timer3 += t4-t3;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class compressor>
|
template <class compressor>
|
||||||
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
|
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
this->Prepare();
|
this->Prepare();
|
||||||
|
double t0=usecond();
|
||||||
this->HaloGatherOpt(source,compress);
|
this->HaloGatherOpt(source,compress);
|
||||||
|
double t1=usecond();
|
||||||
|
timer0 += t1-t0;
|
||||||
|
callsi++;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class compressor>
|
template <class compressor>
|
||||||
@ -304,7 +341,9 @@ public:
|
|||||||
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
|
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
|
||||||
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
|
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
|
||||||
|
|
||||||
|
this->mpi3synctime_g-=usecond();
|
||||||
this->_grid->StencilBarrier();
|
this->_grid->StencilBarrier();
|
||||||
|
this->mpi3synctime_g+=usecond();
|
||||||
|
|
||||||
assert(source._grid==this->_grid);
|
assert(source._grid==this->_grid);
|
||||||
this->halogtime-=usecond();
|
this->halogtime-=usecond();
|
||||||
@ -323,7 +362,6 @@ public:
|
|||||||
int dag = compress.dag;
|
int dag = compress.dag;
|
||||||
int face_idx=0;
|
int face_idx=0;
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
// std::cout << " Optimised Dagger compress " <<std::endl;
|
|
||||||
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
||||||
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
||||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
||||||
|
@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
int vol4;
|
int vol4;
|
||||||
vol4=FourDimGrid.oSites();
|
vol4=FourDimGrid.oSites();
|
||||||
Stencil.BuildSurfaceList(LLs,vol4);
|
Stencil.BuildSurfaceList(LLs,vol4);
|
||||||
|
|
||||||
vol4=FourDimRedBlackGrid.oSites();
|
vol4=FourDimRedBlackGrid.oSites();
|
||||||
StencilEven.BuildSurfaceList(LLs,vol4);
|
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||||
StencilOdd.BuildSurfaceList(LLs,vol4);
|
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||||
|
|
||||||
std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
|
// std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
|
||||||
<<" " << StencilEven.surface_list.size()<<std::endl;
|
// <<" " << StencilEven.surface_list.size()<<std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::Report(void)
|
void WilsonFermion5D<Impl>::Report(void)
|
||||||
{
|
{
|
||||||
std::vector<int> latt = GridDefaultLatt();
|
RealD NP = _FourDimGrid->_Nprocessors;
|
||||||
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
RealD NN = _FourDimGrid->NodeCount();
|
||||||
RealD NP = _FourDimGrid->_Nprocessors;
|
RealD volume = Ls;
|
||||||
RealD NN = _FourDimGrid->NodeCount();
|
std::vector<int> latt = _FourDimGrid->GlobalDimensions();
|
||||||
|
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||||
|
|
||||||
if ( DhopCalls > 0 ) {
|
if ( DhopCalls > 0 ) {
|
||||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||||
@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
|
|||||||
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
||||||
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
|
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
|
||||||
}
|
}
|
||||||
|
if ( DhopCalls > 0){
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
|
|||||||
Stencil.ZeroCounters();
|
Stencil.ZeroCounters();
|
||||||
StencilEven.ZeroCounters();
|
StencilEven.ZeroCounters();
|
||||||
StencilOdd.ZeroCounters();
|
StencilOdd.ZeroCounters();
|
||||||
|
Stencil.ZeroCountersi();
|
||||||
|
StencilEven.ZeroCountersi();
|
||||||
|
StencilOdd.ZeroCountersi();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
{
|
{
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
|
||||||
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
|
||||||
@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
|
|
||||||
DhopFaceTime-=usecond();
|
DhopFaceTime-=usecond();
|
||||||
st.HaloExchangeOptGather(in,compressor);
|
st.HaloExchangeOptGather(in,compressor);
|
||||||
DhopFaceTime+=usecond();
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
|
||||||
|
|
||||||
// Rely on async comms; start comms before merge of local data
|
|
||||||
DhopCommTime-=usecond();
|
|
||||||
st.CommunicateBegin(reqs);
|
|
||||||
|
|
||||||
DhopFaceTime-=usecond();
|
|
||||||
st.CommsMergeSHM(compressor);
|
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
// Perhaps use omp task and region
|
double ctime=0;
|
||||||
#pragma omp parallel
|
double ptime=0;
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||||
{
|
{
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
int nthreads = omp_get_num_threads();
|
int nthreads = omp_get_num_threads();
|
||||||
int me = omp_get_thread_num();
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
int myoff, mywork;
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
double start = usecond();
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = U._grid->oSites();
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
|
// do the compute
|
||||||
int sF = LLs * myoff;
|
if (dag == DaggerYes) {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
if ( me == 0 ) {
|
int sU = ss;
|
||||||
st.CommunicateComplete(reqs);
|
int sF = LLs * sU;
|
||||||
DhopCommTime+=usecond();
|
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
|
||||||
} else {
|
}
|
||||||
// Interior links in stencil
|
} else {
|
||||||
if ( me==1 ) DhopComputeTime-=usecond();
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
|
int sU = ss;
|
||||||
else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
|
int sF = LLs * sU;
|
||||||
if ( me==1 ) DhopComputeTime+=usecond();
|
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ptime = usecond() - start;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
double start = usecond();
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
ctime = usecond() - start;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
DhopCommTime += ctime;
|
||||||
|
DhopComputeTime+=ptime;
|
||||||
|
|
||||||
|
// First to enter, last to leave timing
|
||||||
|
st.CollateThreads();
|
||||||
|
|
||||||
DhopFaceTime-=usecond();
|
DhopFaceTime-=usecond();
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
// Load imbalance alert. Should use dynamic schedule OMP for loop
|
|
||||||
// Perhaps create a list of only those sites with face work, and
|
|
||||||
// load balance process the list.
|
|
||||||
DhopComputeTime2-=usecond();
|
DhopComputeTime2-=usecond();
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
int sz=st.surface_list.size();
|
int sz=st.surface_list.size();
|
||||||
@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
|
@ -30,60 +30,181 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define REGISTER
|
#define REGISTER
|
||||||
|
|
||||||
#define LOAD_CHIMU \
|
#define LOAD_CHIMU_BODY(F) \
|
||||||
{const SiteSpinor & ref (in._odata[offset]); \
|
Chimu_00=ref(F)(0)(0); \
|
||||||
Chimu_00=ref()(0)(0);\
|
Chimu_01=ref(F)(0)(1); \
|
||||||
Chimu_01=ref()(0)(1);\
|
Chimu_02=ref(F)(0)(2); \
|
||||||
Chimu_02=ref()(0)(2);\
|
Chimu_10=ref(F)(1)(0); \
|
||||||
Chimu_10=ref()(1)(0);\
|
Chimu_11=ref(F)(1)(1); \
|
||||||
Chimu_11=ref()(1)(1);\
|
Chimu_12=ref(F)(1)(2); \
|
||||||
Chimu_12=ref()(1)(2);\
|
Chimu_20=ref(F)(2)(0); \
|
||||||
Chimu_20=ref()(2)(0);\
|
Chimu_21=ref(F)(2)(1); \
|
||||||
Chimu_21=ref()(2)(1);\
|
Chimu_22=ref(F)(2)(2); \
|
||||||
Chimu_22=ref()(2)(2);\
|
Chimu_30=ref(F)(3)(0); \
|
||||||
Chimu_30=ref()(3)(0);\
|
Chimu_31=ref(F)(3)(1); \
|
||||||
Chimu_31=ref()(3)(1);\
|
Chimu_32=ref(F)(3)(2)
|
||||||
Chimu_32=ref()(3)(2);}
|
|
||||||
|
|
||||||
#define LOAD_CHI\
|
#define LOAD_CHIMU(DIR,F,PERM) \
|
||||||
{const SiteHalfSpinor &ref(buf[offset]); \
|
{ const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
|
||||||
Chi_00 = ref()(0)(0);\
|
|
||||||
Chi_01 = ref()(0)(1);\
|
#define LOAD_CHI_BODY(F) \
|
||||||
Chi_02 = ref()(0)(2);\
|
Chi_00 = ref(F)(0)(0);\
|
||||||
Chi_10 = ref()(1)(0);\
|
Chi_01 = ref(F)(0)(1);\
|
||||||
Chi_11 = ref()(1)(1);\
|
Chi_02 = ref(F)(0)(2);\
|
||||||
Chi_12 = ref()(1)(2);}
|
Chi_10 = ref(F)(1)(0);\
|
||||||
|
Chi_11 = ref(F)(1)(1);\
|
||||||
|
Chi_12 = ref(F)(1)(2)
|
||||||
|
|
||||||
|
#define LOAD_CHI(DIR,F,PERM) \
|
||||||
|
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
|
||||||
|
|
||||||
|
|
||||||
|
//G-parity implementations using in-place intrinsic ops
|
||||||
|
|
||||||
|
//1l 1h -> 1h 1l
|
||||||
|
//0l 0h , 1h 1l -> 0l 1h 0h,1l
|
||||||
|
//0h,1l -> 1l,0h
|
||||||
|
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
|
||||||
|
//Pulled fermion through forwards face, GPBC on upper component
|
||||||
|
//Need 0= 0l 1h 1= 1l 0h
|
||||||
|
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
|
||||||
|
//Pulled fermion through backwards face, GPBC on lower component
|
||||||
|
//Need 0= 1l 0h 1= 0l 1h
|
||||||
|
|
||||||
|
//1l 1h -> 1h 1l
|
||||||
|
//0l 0h , 1h 1l -> 0l 1h 0h,1l
|
||||||
|
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
|
||||||
|
permute##PERM(tmp1, ref(1)(S)(C)); \
|
||||||
|
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
|
||||||
|
INTO = tmp2;
|
||||||
|
|
||||||
|
//0l 0h -> 0h 0l
|
||||||
|
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
|
||||||
|
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
|
||||||
|
permute##PERM(tmp1, ref(0)(S)(C)); \
|
||||||
|
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
|
||||||
|
INTO = tmp2;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define LOAD_CHI_SETUP(DIR,F) \
|
||||||
|
g = F; \
|
||||||
|
direction = st._directions[DIR]; \
|
||||||
|
distance = st._distances[DIR]; \
|
||||||
|
sl = st._grid->_simd_layout[direction]; \
|
||||||
|
inplace_twist = 0; \
|
||||||
|
if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \
|
||||||
|
if(sl == 1){ \
|
||||||
|
g = (F+1) % 2; \
|
||||||
|
}else{ \
|
||||||
|
inplace_twist = 1; \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
|
||||||
|
{ const SiteSpinor &ref(in._odata[offset]); \
|
||||||
|
LOAD_CHI_SETUP(DIR,F); \
|
||||||
|
if(!inplace_twist){ \
|
||||||
|
LOAD_CHIMU_BODY(g); \
|
||||||
|
}else{ \
|
||||||
|
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
|
||||||
|
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
|
||||||
|
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
}else{ \
|
||||||
|
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
|
||||||
|
{ const SiteHalfSpinor &ref(buf[offset]); \
|
||||||
|
LOAD_CHI_SETUP(DIR,F); \
|
||||||
|
if(!inplace_twist){ \
|
||||||
|
LOAD_CHI_BODY(g); \
|
||||||
|
}else{ \
|
||||||
|
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
|
||||||
|
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
|
||||||
|
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
|
||||||
|
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
|
||||||
|
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
}else{ \
|
||||||
|
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
|
||||||
|
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
|
||||||
|
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
|
||||||
|
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
|
||||||
|
|
||||||
// To splat or not to splat depends on the implementation
|
// To splat or not to splat depends on the implementation
|
||||||
#define MULT_2SPIN(A)\
|
#define MULT_2SPIN_BODY \
|
||||||
{auto & ref(U._odata[sU](A)); \
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
UChi_00 = U_00*Chi_00; \
|
||||||
UChi_00 = U_00*Chi_00;\
|
UChi_10 = U_00*Chi_10; \
|
||||||
UChi_10 = U_00*Chi_10;\
|
UChi_01 = U_10*Chi_00; \
|
||||||
UChi_01 = U_10*Chi_00;\
|
UChi_11 = U_10*Chi_10; \
|
||||||
UChi_11 = U_10*Chi_10;\
|
UChi_02 = U_20*Chi_00; \
|
||||||
UChi_02 = U_20*Chi_00;\
|
UChi_12 = U_20*Chi_10; \
|
||||||
UChi_12 = U_20*Chi_10;\
|
UChi_00+= U_01*Chi_01; \
|
||||||
UChi_00+= U_01*Chi_01;\
|
UChi_10+= U_01*Chi_11; \
|
||||||
UChi_10+= U_01*Chi_11;\
|
UChi_01+= U_11*Chi_01; \
|
||||||
UChi_01+= U_11*Chi_01;\
|
UChi_11+= U_11*Chi_11; \
|
||||||
UChi_11+= U_11*Chi_11;\
|
UChi_02+= U_21*Chi_01; \
|
||||||
UChi_02+= U_21*Chi_01;\
|
UChi_12+= U_21*Chi_11; \
|
||||||
UChi_12+= U_21*Chi_11;\
|
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||||
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||||
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||||
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
UChi_00+= U_00*Chi_02; \
|
||||||
UChi_00+= U_00*Chi_02;\
|
UChi_10+= U_00*Chi_12; \
|
||||||
UChi_10+= U_00*Chi_12;\
|
UChi_01+= U_10*Chi_02; \
|
||||||
UChi_01+= U_10*Chi_02;\
|
UChi_11+= U_10*Chi_12; \
|
||||||
UChi_11+= U_10*Chi_12;\
|
UChi_02+= U_20*Chi_02; \
|
||||||
UChi_02+= U_20*Chi_02;\
|
UChi_12+= U_20*Chi_12
|
||||||
UChi_12+= U_20*Chi_12;}
|
|
||||||
|
|
||||||
|
#define MULT_2SPIN(A,F) \
|
||||||
|
{auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
|
||||||
|
|
||||||
|
#define MULT_2SPIN_GPARITY(A,F) \
|
||||||
|
{auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
|
||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
@ -307,84 +428,87 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
result_31-= UChi_11; \
|
result_31-= UChi_11; \
|
||||||
result_32-= UChi_12;
|
result_32-= UChi_12;
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
|
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
offset = SE->_offset; \
|
offset = SE->_offset; \
|
||||||
local = SE->_is_local; \
|
local = SE->_is_local; \
|
||||||
perm = SE->_permute; \
|
perm = SE->_permute; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
LOAD_CHIMU; \
|
LOAD_CHIMU_IMPL(DIR,F,PERM); \
|
||||||
PROJ; \
|
PROJ; \
|
||||||
if ( perm) { \
|
if ( perm) { \
|
||||||
PERMUTE_DIR(PERM); \
|
PERMUTE_DIR(PERM); \
|
||||||
} \
|
} \
|
||||||
} else { \
|
} else { \
|
||||||
LOAD_CHI; \
|
LOAD_CHI_IMPL(DIR,F,PERM); \
|
||||||
} \
|
} \
|
||||||
MULT_2SPIN(DIR); \
|
MULT_2SPIN_IMPL(DIR,F); \
|
||||||
RECON;
|
RECON;
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
|
|
||||||
|
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
offset = SE->_offset; \
|
offset = SE->_offset; \
|
||||||
local = SE->_is_local; \
|
local = SE->_is_local; \
|
||||||
perm = SE->_permute; \
|
perm = SE->_permute; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
LOAD_CHIMU; \
|
LOAD_CHIMU_IMPL(DIR,F,PERM); \
|
||||||
PROJ; \
|
PROJ; \
|
||||||
if ( perm) { \
|
if ( perm) { \
|
||||||
PERMUTE_DIR(PERM); \
|
PERMUTE_DIR(PERM); \
|
||||||
} \
|
} \
|
||||||
} else if ( st.same_node[DIR] ) { \
|
} else if ( st.same_node[DIR] ) { \
|
||||||
LOAD_CHI; \
|
LOAD_CHI_IMPL(DIR,F,PERM); \
|
||||||
} \
|
} \
|
||||||
if (local || st.same_node[DIR] ) { \
|
if (local || st.same_node[DIR] ) { \
|
||||||
MULT_2SPIN(DIR); \
|
MULT_2SPIN_IMPL(DIR,F); \
|
||||||
RECON; \
|
RECON; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
|
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
offset = SE->_offset; \
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
||||||
LOAD_CHI; \
|
LOAD_CHI_IMPL(DIR,F,PERM); \
|
||||||
MULT_2SPIN(DIR); \
|
MULT_2SPIN_IMPL(DIR,F); \
|
||||||
RECON; \
|
RECON; \
|
||||||
nmu++; \
|
nmu++; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HAND_RESULT(ss) \
|
#define HAND_RESULT(ss,F) \
|
||||||
{ \
|
{ \
|
||||||
SiteSpinor & ref (out._odata[ss]); \
|
SiteSpinor & ref (out._odata[ss]); \
|
||||||
vstream(ref()(0)(0),result_00); \
|
vstream(ref(F)(0)(0),result_00); \
|
||||||
vstream(ref()(0)(1),result_01); \
|
vstream(ref(F)(0)(1),result_01); \
|
||||||
vstream(ref()(0)(2),result_02); \
|
vstream(ref(F)(0)(2),result_02); \
|
||||||
vstream(ref()(1)(0),result_10); \
|
vstream(ref(F)(1)(0),result_10); \
|
||||||
vstream(ref()(1)(1),result_11); \
|
vstream(ref(F)(1)(1),result_11); \
|
||||||
vstream(ref()(1)(2),result_12); \
|
vstream(ref(F)(1)(2),result_12); \
|
||||||
vstream(ref()(2)(0),result_20); \
|
vstream(ref(F)(2)(0),result_20); \
|
||||||
vstream(ref()(2)(1),result_21); \
|
vstream(ref(F)(2)(1),result_21); \
|
||||||
vstream(ref()(2)(2),result_22); \
|
vstream(ref(F)(2)(2),result_22); \
|
||||||
vstream(ref()(3)(0),result_30); \
|
vstream(ref(F)(3)(0),result_30); \
|
||||||
vstream(ref()(3)(1),result_31); \
|
vstream(ref(F)(3)(1),result_31); \
|
||||||
vstream(ref()(3)(2),result_32); \
|
vstream(ref(F)(3)(2),result_32); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HAND_RESULT_EXT(ss) \
|
#define HAND_RESULT_EXT(ss,F) \
|
||||||
if (nmu){ \
|
if (nmu){ \
|
||||||
SiteSpinor & ref (out._odata[ss]); \
|
SiteSpinor & ref (out._odata[ss]); \
|
||||||
ref()(0)(0)+=result_00; \
|
ref(F)(0)(0)+=result_00; \
|
||||||
ref()(0)(1)+=result_01; \
|
ref(F)(0)(1)+=result_01; \
|
||||||
ref()(0)(2)+=result_02; \
|
ref(F)(0)(2)+=result_02; \
|
||||||
ref()(1)(0)+=result_10; \
|
ref(F)(1)(0)+=result_10; \
|
||||||
ref()(1)(1)+=result_11; \
|
ref(F)(1)(1)+=result_11; \
|
||||||
ref()(1)(2)+=result_12; \
|
ref(F)(1)(2)+=result_12; \
|
||||||
ref()(2)(0)+=result_20; \
|
ref(F)(2)(0)+=result_20; \
|
||||||
ref()(2)(1)+=result_21; \
|
ref(F)(2)(1)+=result_21; \
|
||||||
ref()(2)(2)+=result_22; \
|
ref(F)(2)(2)+=result_22; \
|
||||||
ref()(3)(0)+=result_30; \
|
ref(F)(3)(0)+=result_30; \
|
||||||
ref()(3)(1)+=result_31; \
|
ref(F)(3)(1)+=result_31; \
|
||||||
ref()(3)(2)+=result_32; \
|
ref(F)(3)(2)+=result_32; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -463,15 +587,18 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
|
|||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
|
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_RESULT(ss);
|
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT(ss,F)
|
||||||
|
|
||||||
|
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -486,15 +613,18 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
|
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_RESULT(ss);
|
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT(ss,F)
|
||||||
|
|
||||||
|
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> void
|
||||||
@ -509,16 +639,20 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
|
|||||||
|
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
ZERO_RESULT;
|
|
||||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
ZERO_RESULT; \
|
||||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_RESULT(ss);
|
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT(ss,F)
|
||||||
|
|
||||||
|
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -532,16 +666,20 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
|
|||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
ZERO_RESULT;
|
|
||||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
ZERO_RESULT; \
|
||||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_RESULT(ss);
|
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT(ss,F)
|
||||||
|
|
||||||
|
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> void
|
||||||
@ -557,16 +695,20 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
|
|||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
ZERO_RESULT;
|
|
||||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
ZERO_RESULT; \
|
||||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_RESULT_EXT(ss);
|
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT_EXT(ss,F)
|
||||||
|
|
||||||
|
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -581,16 +723,20 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
ZERO_RESULT;
|
|
||||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
ZERO_RESULT; \
|
||||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
HAND_RESULT_EXT(ss);
|
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT_EXT(ss,F)
|
||||||
|
|
||||||
|
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
@ -646,10 +792,123 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
|
|||||||
const FermionField &in, \
|
const FermionField &in, \
|
||||||
FermionField &out){ assert(0); } \
|
FermionField &out){ assert(0); } \
|
||||||
|
|
||||||
HAND_SPECIALISE_EMPTY(GparityWilsonImplF);
|
|
||||||
HAND_SPECIALISE_EMPTY(GparityWilsonImplD);
|
|
||||||
HAND_SPECIALISE_EMPTY(GparityWilsonImplFH);
|
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
||||||
HAND_SPECIALISE_EMPTY(GparityWilsonImplDF);
|
template<> void \
|
||||||
|
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
StencilEntry *SE; \
|
||||||
|
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
template<> \
|
||||||
|
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
StencilEntry *SE; \
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
template<> void \
|
||||||
|
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
StencilEntry *SE; \
|
||||||
|
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
template<> \
|
||||||
|
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
StencilEntry *SE; \
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
template<> void \
|
||||||
|
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
StencilEntry *SE; \
|
||||||
|
int nmu=0; \
|
||||||
|
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
nmu = 0; \
|
||||||
|
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
template<> \
|
||||||
|
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
StencilEntry *SE; \
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
int nmu=0; \
|
||||||
|
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
nmu = 0; \
|
||||||
|
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
|
||||||
|
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
|
||||||
|
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
|
||||||
|
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
////////////// Wilson ; uses this implementation /////////////////////
|
////////////// Wilson ; uses this implementation /////////////////////
|
||||||
|
|
||||||
|
286
lib/qcd/action/gauge/Photon.h
Normal file
286
lib/qcd/action/gauge/Photon.h
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/gauge/Photon.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef QCD_PHOTON_ACTION_H
|
||||||
|
#define QCD_PHOTON_ACTION_H
|
||||||
|
|
||||||
|
namespace Grid{
|
||||||
|
namespace QCD{
|
||||||
|
template <class S>
|
||||||
|
class QedGimpl
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef S Simd;
|
||||||
|
|
||||||
|
template <typename vtype>
|
||||||
|
using iImplGaugeLink = iScalar<iScalar<iScalar<vtype>>>;
|
||||||
|
template <typename vtype>
|
||||||
|
using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
|
||||||
|
|
||||||
|
typedef iImplGaugeLink<Simd> SiteLink;
|
||||||
|
typedef iImplGaugeField<Simd> SiteField;
|
||||||
|
typedef SiteField SiteComplex;
|
||||||
|
|
||||||
|
typedef Lattice<SiteLink> LinkField;
|
||||||
|
typedef Lattice<SiteField> Field;
|
||||||
|
typedef Field ComplexField;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef QedGimpl<vComplex> QedGimplR;
|
||||||
|
|
||||||
|
template<class Gimpl>
|
||||||
|
class Photon
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
|
GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
|
||||||
|
GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
|
||||||
|
public:
|
||||||
|
Photon(Gauge gauge, ZmScheme zmScheme);
|
||||||
|
virtual ~Photon(void) = default;
|
||||||
|
void FreePropagator(const GaugeField &in, GaugeField &out);
|
||||||
|
void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
|
||||||
|
void StochasticWeight(GaugeLinkField &weight);
|
||||||
|
void StochasticField(GaugeField &out, GridParallelRNG &rng);
|
||||||
|
void StochasticField(GaugeField &out, GridParallelRNG &rng,
|
||||||
|
const GaugeLinkField &weight);
|
||||||
|
private:
|
||||||
|
void invKHatSquared(GaugeLinkField &out);
|
||||||
|
void zmSub(GaugeLinkField &out);
|
||||||
|
private:
|
||||||
|
Gauge gauge_;
|
||||||
|
ZmScheme zmScheme_;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef Photon<QedGimplR> PhotonR;
|
||||||
|
|
||||||
|
template<class Gimpl>
|
||||||
|
Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
|
||||||
|
: gauge_(gauge), zmScheme_(zmScheme)
|
||||||
|
{}
|
||||||
|
|
||||||
|
template<class Gimpl>
|
||||||
|
void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
|
||||||
|
{
|
||||||
|
FFT theFFT(in._grid);
|
||||||
|
|
||||||
|
GaugeField in_k(in._grid);
|
||||||
|
GaugeField prop_k(in._grid);
|
||||||
|
|
||||||
|
theFFT.FFT_all_dim(in_k,in,FFT::forward);
|
||||||
|
MomentumSpacePropagator(prop_k,in_k);
|
||||||
|
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Gimpl>
|
||||||
|
void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
|
||||||
|
{
|
||||||
|
GridBase *grid = out._grid;
|
||||||
|
GaugeLinkField kmu(grid), one(grid);
|
||||||
|
const unsigned int nd = grid->_ndimension;
|
||||||
|
std::vector<int> &l = grid->_fdimensions;
|
||||||
|
std::vector<int> zm(nd,0);
|
||||||
|
TComplex Tone = Complex(1.0,0.0);
|
||||||
|
TComplex Tzero= Complex(0.0,0.0);
|
||||||
|
|
||||||
|
one = Complex(1.0,0.0);
|
||||||
|
out = zero;
|
||||||
|
for(int mu = 0; mu < nd; mu++)
|
||||||
|
{
|
||||||
|
Real twoPiL = M_PI*2./l[mu];
|
||||||
|
|
||||||
|
LatticeCoordinate(kmu,mu);
|
||||||
|
kmu = 2.*sin(.5*twoPiL*kmu);
|
||||||
|
out = out + kmu*kmu;
|
||||||
|
}
|
||||||
|
pokeSite(Tone, out, zm);
|
||||||
|
out = one/out;
|
||||||
|
pokeSite(Tzero, out, zm);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Gimpl>
|
||||||
|
void Photon<Gimpl>::zmSub(GaugeLinkField &out)
|
||||||
|
{
|
||||||
|
GridBase *grid = out._grid;
|
||||||
|
const unsigned int nd = grid->_ndimension;
|
||||||
|
|
||||||
|
switch (zmScheme_)
|
||||||
|
{
|
||||||
|
case ZmScheme::qedTL:
|
||||||
|
{
|
||||||
|
std::vector<int> zm(nd,0);
|
||||||
|
TComplex Tzero = Complex(0.0,0.0);
|
||||||
|
|
||||||
|
pokeSite(Tzero, out, zm);
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case ZmScheme::qedL:
|
||||||
|
{
|
||||||
|
LatticeInteger spNrm(grid), coor(grid);
|
||||||
|
GaugeLinkField z(grid);
|
||||||
|
|
||||||
|
spNrm = zero;
|
||||||
|
for(int d = 0; d < grid->_ndimension - 1; d++)
|
||||||
|
{
|
||||||
|
LatticeCoordinate(coor,d);
|
||||||
|
spNrm = spNrm + coor*coor;
|
||||||
|
}
|
||||||
|
out = where(spNrm == Integer(0), 0.*out, out);
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Gimpl>
|
||||||
|
void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
|
||||||
|
GaugeField &out)
|
||||||
|
{
|
||||||
|
GridBase *grid = out._grid;
|
||||||
|
LatticeComplex k2Inv(grid);
|
||||||
|
|
||||||
|
invKHatSquared(k2Inv);
|
||||||
|
zmSub(k2Inv);
|
||||||
|
|
||||||
|
out = in*k2Inv;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Gimpl>
|
||||||
|
void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
|
||||||
|
{
|
||||||
|
auto *grid = dynamic_cast<GridCartesian *>(weight._grid);
|
||||||
|
const unsigned int nd = grid->_ndimension;
|
||||||
|
std::vector<int> latt_size = grid->_fdimensions;
|
||||||
|
|
||||||
|
Integer vol = 1;
|
||||||
|
for(int d = 0; d < nd; d++)
|
||||||
|
{
|
||||||
|
vol = vol * latt_size[d];
|
||||||
|
}
|
||||||
|
invKHatSquared(weight);
|
||||||
|
weight = sqrt(vol*real(weight));
|
||||||
|
zmSub(weight);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Gimpl>
|
||||||
|
void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
|
||||||
|
{
|
||||||
|
auto *grid = dynamic_cast<GridCartesian *>(out._grid);
|
||||||
|
GaugeLinkField weight(grid);
|
||||||
|
|
||||||
|
StochasticWeight(weight);
|
||||||
|
StochasticField(out, rng, weight);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Gimpl>
|
||||||
|
void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
|
||||||
|
const GaugeLinkField &weight)
|
||||||
|
{
|
||||||
|
auto *grid = dynamic_cast<GridCartesian *>(out._grid);
|
||||||
|
const unsigned int nd = grid->_ndimension;
|
||||||
|
GaugeLinkField r(grid);
|
||||||
|
GaugeField aTilde(grid);
|
||||||
|
FFT fft(grid);
|
||||||
|
|
||||||
|
for(int mu = 0; mu < nd; mu++)
|
||||||
|
{
|
||||||
|
gaussian(rng, r);
|
||||||
|
r = weight*r;
|
||||||
|
pokeLorentz(aTilde, r, mu);
|
||||||
|
}
|
||||||
|
fft.FFT_all_dim(out, aTilde, FFT::backward);
|
||||||
|
|
||||||
|
out = real(out);
|
||||||
|
}
|
||||||
|
// template<class Gimpl>
|
||||||
|
// void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
|
||||||
|
// const GaugeField &in)
|
||||||
|
// {
|
||||||
|
//
|
||||||
|
// FeynmanGaugeMomentumSpacePropagator_TL(out,in);
|
||||||
|
//
|
||||||
|
// GridBase *grid = out._grid;
|
||||||
|
// LatticeInteger coor(grid);
|
||||||
|
// GaugeField zz(grid); zz=zero;
|
||||||
|
//
|
||||||
|
// // xyzt
|
||||||
|
// for(int d = 0; d < grid->_ndimension-1;d++){
|
||||||
|
// LatticeCoordinate(coor,d);
|
||||||
|
// out = where(coor==Integer(0),zz,out);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// template<class Gimpl>
|
||||||
|
// void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
|
||||||
|
// const GaugeField &in)
|
||||||
|
// {
|
||||||
|
//
|
||||||
|
// // what type LatticeComplex
|
||||||
|
// GridBase *grid = out._grid;
|
||||||
|
// int nd = grid->_ndimension;
|
||||||
|
//
|
||||||
|
// typedef typename GaugeField::vector_type vector_type;
|
||||||
|
// typedef typename GaugeField::scalar_type ScalComplex;
|
||||||
|
// typedef Lattice<iSinglet<vector_type> > LatComplex;
|
||||||
|
//
|
||||||
|
// std::vector<int> latt_size = grid->_fdimensions;
|
||||||
|
//
|
||||||
|
// LatComplex denom(grid); denom= zero;
|
||||||
|
// LatComplex one(grid); one = ScalComplex(1.0,0.0);
|
||||||
|
// LatComplex kmu(grid);
|
||||||
|
//
|
||||||
|
// ScalComplex ci(0.0,1.0);
|
||||||
|
// // momphase = n * 2pi / L
|
||||||
|
// for(int mu=0;mu<Nd;mu++) {
|
||||||
|
//
|
||||||
|
// LatticeCoordinate(kmu,mu);
|
||||||
|
//
|
||||||
|
// RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
||||||
|
//
|
||||||
|
// kmu = TwoPiL * kmu ;
|
||||||
|
//
|
||||||
|
// denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
|
||||||
|
// }
|
||||||
|
// std::vector<int> zero_mode(nd,0);
|
||||||
|
// TComplexD Tone = ComplexD(1.0,0.0);
|
||||||
|
// TComplexD Tzero= ComplexD(0.0,0.0);
|
||||||
|
//
|
||||||
|
// pokeSite(Tone,denom,zero_mode);
|
||||||
|
//
|
||||||
|
// denom= one/denom;
|
||||||
|
//
|
||||||
|
// pokeSite(Tzero,denom,zero_mode);
|
||||||
|
//
|
||||||
|
// out = zero;
|
||||||
|
// out = in*denom;
|
||||||
|
// };
|
||||||
|
|
||||||
|
}}
|
||||||
|
#endif
|
264
lib/qcd/action/pseudofermion/ExactOneFlavourRatio.h
Normal file
264
lib/qcd/action/pseudofermion/ExactOneFlavourRatio.h
Normal file
@ -0,0 +1,264 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/pseudofermion/ExactOneFlavourRatio.h
|
||||||
|
|
||||||
|
Copyright (C) 2017
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// Implementation of exact one flavour algorithm (EOFA) //
|
||||||
|
// using fermion classes defined in: //
|
||||||
|
// Grid/qcd/action/fermion/DomainWallEOFAFermion.h (Shamir) //
|
||||||
|
// Grid/qcd/action/fermion/MobiusEOFAFermion.h (Mobius) //
|
||||||
|
// arXiv: 1403.1683, 1706.05843 //
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#ifndef QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
|
||||||
|
#define QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
|
||||||
|
|
||||||
|
namespace Grid{
|
||||||
|
namespace QCD{
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
// Exact one flavour implementation of DWF determinant ratio //
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
|
typedef OneFlavourRationalParams Params;
|
||||||
|
Params param;
|
||||||
|
MultiShiftFunction PowerNegHalf;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool use_heatbath_forecasting;
|
||||||
|
AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
|
||||||
|
AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
|
||||||
|
SchurRedBlackDiagMooeeSolve<FermionField> Solver;
|
||||||
|
FermionField Phi; // the pseudofermion field for this trajectory
|
||||||
|
|
||||||
|
public:
|
||||||
|
ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, AbstractEOFAFermion<Impl>& _Rop,
|
||||||
|
OperatorFunction<FermionField>& S, Params& p, bool use_fc=false) : Lop(_Lop), Rop(_Rop), Solver(S),
|
||||||
|
Phi(_Lop.FermionGrid()), param(p), use_heatbath_forecasting(use_fc)
|
||||||
|
{
|
||||||
|
AlgRemez remez(param.lo, param.hi, param.precision);
|
||||||
|
|
||||||
|
// MdagM^(+- 1/2)
|
||||||
|
std::cout << GridLogMessage << "Generating degree " << param.degree << " for x^(-1/2)" << std::endl;
|
||||||
|
remez.generateApprox(param.degree, 1, 2);
|
||||||
|
PowerNegHalf.Init(remez, param.tolerance, true);
|
||||||
|
};
|
||||||
|
|
||||||
|
virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }
|
||||||
|
|
||||||
|
virtual std::string LogParameters() {
|
||||||
|
std::stringstream sstream;
|
||||||
|
sstream << GridLogMessage << "[" << action_name() << "] Low :" << param.lo << std::endl;
|
||||||
|
sstream << GridLogMessage << "[" << action_name() << "] High :" << param.hi << std::endl;
|
||||||
|
sstream << GridLogMessage << "[" << action_name() << "] Max iterations :" << param.MaxIter << std::endl;
|
||||||
|
sstream << GridLogMessage << "[" << action_name() << "] Tolerance :" << param.tolerance << std::endl;
|
||||||
|
sstream << GridLogMessage << "[" << action_name() << "] Degree :" << param.degree << std::endl;
|
||||||
|
sstream << GridLogMessage << "[" << action_name() << "] Precision :" << param.precision << std::endl;
|
||||||
|
return sstream.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Spin projection
|
||||||
|
void spProj(const FermionField& in, FermionField& out, int sign, int Ls)
|
||||||
|
{
|
||||||
|
if(sign == 1){ for(int s=0; s<Ls; ++s){ axpby_ssp_pplus(out, 0.0, in, 1.0, in, s, s); } }
|
||||||
|
else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
|
||||||
|
}
|
||||||
|
|
||||||
|
// EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
|
||||||
|
// We generate a Gaussian noise vector \eta, and then compute
|
||||||
|
// \Phi = M_{\rm EOFA}^{-1/2} * \eta
|
||||||
|
// using a rational approximation to the inverse square root
|
||||||
|
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG)
|
||||||
|
{
|
||||||
|
Lop.ImportGauge(U);
|
||||||
|
Rop.ImportGauge(U);
|
||||||
|
|
||||||
|
FermionField eta (Lop.FermionGrid());
|
||||||
|
FermionField CG_src (Lop.FermionGrid());
|
||||||
|
FermionField CG_soln (Lop.FermionGrid());
|
||||||
|
FermionField Forecast_src(Lop.FermionGrid());
|
||||||
|
std::vector<FermionField> tmp(2, Lop.FermionGrid());
|
||||||
|
|
||||||
|
// Use chronological inverter to forecast solutions across poles
|
||||||
|
std::vector<FermionField> prev_solns;
|
||||||
|
if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
|
||||||
|
ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
|
||||||
|
|
||||||
|
// Seed with Gaussian noise vector (var = 0.5)
|
||||||
|
RealD scale = std::sqrt(0.5);
|
||||||
|
gaussian(pRNG,eta);
|
||||||
|
eta = eta * scale;
|
||||||
|
printf("Heatbath source vector: <\\eta|\\eta> = %1.15e\n", norm2(eta));
|
||||||
|
|
||||||
|
// \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
|
||||||
|
RealD N(PowerNegHalf.norm);
|
||||||
|
for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
|
||||||
|
Phi = eta * N;
|
||||||
|
|
||||||
|
// LH terms:
|
||||||
|
// \Phi = \Phi + k \sum_{k=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf)
|
||||||
|
// - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta
|
||||||
|
RealD gamma_l(0.0);
|
||||||
|
spProj(eta, tmp[0], -1, Lop.Ls);
|
||||||
|
Lop.Omega(tmp[0], tmp[1], -1, 0);
|
||||||
|
G5R5(CG_src, tmp[1]);
|
||||||
|
tmp[1] = zero;
|
||||||
|
for(int k=0; k<param.degree; ++k){
|
||||||
|
gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
|
||||||
|
Lop.RefreshShiftCoefficients(-gamma_l);
|
||||||
|
if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
|
||||||
|
Lop.Mdag(CG_src, Forecast_src);
|
||||||
|
CG_soln = Forecast(Lop, Forecast_src, prev_solns);
|
||||||
|
Solver(Lop, CG_src, CG_soln);
|
||||||
|
prev_solns.push_back(CG_soln);
|
||||||
|
} else {
|
||||||
|
CG_soln = zero; // Just use zero as the initial guess
|
||||||
|
Solver(Lop, CG_src, CG_soln);
|
||||||
|
}
|
||||||
|
Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
|
||||||
|
tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
|
||||||
|
}
|
||||||
|
Lop.Omega(tmp[1], tmp[0], -1, 1);
|
||||||
|
spProj(tmp[0], tmp[1], -1, Lop.Ls);
|
||||||
|
Phi = Phi + tmp[1];
|
||||||
|
|
||||||
|
// RH terms:
|
||||||
|
// \Phi = \Phi - k \sum_{k=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb)
|
||||||
|
// + \gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta
|
||||||
|
spProj(eta, tmp[0], 1, Rop.Ls);
|
||||||
|
Rop.Omega(tmp[0], tmp[1], 1, 0);
|
||||||
|
G5R5(CG_src, tmp[1]);
|
||||||
|
tmp[1] = zero;
|
||||||
|
if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
|
||||||
|
for(int k=0; k<param.degree; ++k){
|
||||||
|
gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
|
||||||
|
Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
|
||||||
|
if(use_heatbath_forecasting){
|
||||||
|
Rop.Mdag(CG_src, Forecast_src);
|
||||||
|
CG_soln = Forecast(Rop, Forecast_src, prev_solns);
|
||||||
|
Solver(Rop, CG_src, CG_soln);
|
||||||
|
prev_solns.push_back(CG_soln);
|
||||||
|
} else {
|
||||||
|
CG_soln = zero;
|
||||||
|
Solver(Rop, CG_src, CG_soln);
|
||||||
|
}
|
||||||
|
Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
|
||||||
|
tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
|
||||||
|
}
|
||||||
|
Rop.Omega(tmp[1], tmp[0], 1, 1);
|
||||||
|
spProj(tmp[0], tmp[1], 1, Rop.Ls);
|
||||||
|
Phi = Phi + tmp[1];
|
||||||
|
|
||||||
|
// Reset shift coefficients for energy and force evals
|
||||||
|
Lop.RefreshShiftCoefficients(0.0);
|
||||||
|
Rop.RefreshShiftCoefficients(-1.0);
|
||||||
|
};
|
||||||
|
|
||||||
|
// EOFA action: see Eqn. (10) of arXiv:1706.05843
|
||||||
|
virtual RealD S(const GaugeField& U)
|
||||||
|
{
|
||||||
|
Lop.ImportGauge(U);
|
||||||
|
Rop.ImportGauge(U);
|
||||||
|
|
||||||
|
FermionField spProj_Phi(Lop.FermionGrid());
|
||||||
|
std::vector<FermionField> tmp(2, Lop.FermionGrid());
|
||||||
|
|
||||||
|
// S = <\Phi|\Phi>
|
||||||
|
RealD action(norm2(Phi));
|
||||||
|
|
||||||
|
// LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
|
||||||
|
spProj(Phi, spProj_Phi, -1, Lop.Ls);
|
||||||
|
Lop.Omega(spProj_Phi, tmp[0], -1, 0);
|
||||||
|
G5R5(tmp[1], tmp[0]);
|
||||||
|
tmp[0] = zero;
|
||||||
|
Solver(Lop, tmp[1], tmp[0]);
|
||||||
|
Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
|
||||||
|
Lop.Omega(tmp[1], tmp[0], -1, 1);
|
||||||
|
action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
|
||||||
|
|
||||||
|
// RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
|
||||||
|
// - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
|
||||||
|
spProj(Phi, spProj_Phi, 1, Rop.Ls);
|
||||||
|
Rop.Omega(spProj_Phi, tmp[0], 1, 0);
|
||||||
|
G5R5(tmp[1], tmp[0]);
|
||||||
|
tmp[0] = zero;
|
||||||
|
Solver(Rop, tmp[1], tmp[0]);
|
||||||
|
Rop.Dtilde(tmp[0], tmp[1]);
|
||||||
|
Rop.Omega(tmp[1], tmp[0], 1, 1);
|
||||||
|
action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
|
||||||
|
|
||||||
|
return action;
|
||||||
|
};
|
||||||
|
|
||||||
|
// EOFA pseudofermion force: see Eqns. (34)-(36) of arXiv:1706.05843
|
||||||
|
virtual void deriv(const GaugeField& U, GaugeField& dSdU)
|
||||||
|
{
|
||||||
|
Lop.ImportGauge(U);
|
||||||
|
Rop.ImportGauge(U);
|
||||||
|
|
||||||
|
FermionField spProj_Phi (Lop.FermionGrid());
|
||||||
|
FermionField Omega_spProj_Phi(Lop.FermionGrid());
|
||||||
|
FermionField CG_src (Lop.FermionGrid());
|
||||||
|
FermionField Chi (Lop.FermionGrid());
|
||||||
|
FermionField g5_R5_Chi (Lop.FermionGrid());
|
||||||
|
|
||||||
|
GaugeField force(Lop.GaugeGrid());
|
||||||
|
|
||||||
|
// LH: dSdU = k \chi_{L}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{L}
|
||||||
|
// \chi_{L} = H(mf)^{-1} \Omega_{-} P_{-} \Phi
|
||||||
|
spProj(Phi, spProj_Phi, -1, Lop.Ls);
|
||||||
|
Lop.Omega(spProj_Phi, Omega_spProj_Phi, -1, 0);
|
||||||
|
G5R5(CG_src, Omega_spProj_Phi);
|
||||||
|
spProj_Phi = zero;
|
||||||
|
Solver(Lop, CG_src, spProj_Phi);
|
||||||
|
Lop.Dtilde(spProj_Phi, Chi);
|
||||||
|
G5R5(g5_R5_Chi, Chi);
|
||||||
|
Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
|
||||||
|
dSdU = Lop.k * force;
|
||||||
|
|
||||||
|
// RH: dSdU = dSdU - k \chi_{R}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{}
|
||||||
|
// \chi_{R} = ( H(mb) - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \Phi
|
||||||
|
spProj(Phi, spProj_Phi, 1, Rop.Ls);
|
||||||
|
Rop.Omega(spProj_Phi, Omega_spProj_Phi, 1, 0);
|
||||||
|
G5R5(CG_src, Omega_spProj_Phi);
|
||||||
|
spProj_Phi = zero;
|
||||||
|
Solver(Rop, CG_src, spProj_Phi);
|
||||||
|
Rop.Dtilde(spProj_Phi, Chi);
|
||||||
|
G5R5(g5_R5_Chi, Chi);
|
||||||
|
Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
|
||||||
|
dSdU = dSdU - Rop.k * force;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}}
|
||||||
|
|
||||||
|
#endif
|
@ -38,5 +38,6 @@ directory
|
|||||||
#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
|
#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
|
||||||
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
|
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
|
||||||
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
|
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
|
||||||
|
#include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -31,6 +31,7 @@ directory
|
|||||||
|
|
||||||
#include <Grid/qcd/action/scalar/ScalarImpl.h>
|
#include <Grid/qcd/action/scalar/ScalarImpl.h>
|
||||||
#include <Grid/qcd/action/scalar/ScalarAction.h>
|
#include <Grid/qcd/action/scalar/ScalarAction.h>
|
||||||
|
#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
@ -39,6 +40,10 @@ namespace QCD {
|
|||||||
typedef ScalarAction<ScalarImplF> ScalarActionF;
|
typedef ScalarAction<ScalarImplF> ScalarActionF;
|
||||||
typedef ScalarAction<ScalarImplD> ScalarActionD;
|
typedef ScalarAction<ScalarImplD> ScalarActionD;
|
||||||
|
|
||||||
|
template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
|
||||||
|
template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
|
||||||
|
template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,10 +6,10 @@
|
|||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: neo <cossu@post.kek.jp>
|
Author: neo <cossu@post.kek.jp>
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -36,49 +36,48 @@ directory
|
|||||||
namespace Grid {
|
namespace Grid {
|
||||||
// FIXME drop the QCD namespace everywhere here
|
// FIXME drop the QCD namespace everywhere here
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
class ScalarAction : public QCD::Action<typename Impl::Field> {
|
class ScalarAction : public QCD::Action<typename Impl::Field> {
|
||||||
public:
|
public:
|
||||||
INHERIT_FIELD_TYPES(Impl);
|
INHERIT_FIELD_TYPES(Impl);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
RealD mass_square;
|
RealD mass_square;
|
||||||
RealD lambda;
|
RealD lambda;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
|
ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
|
||||||
|
|
||||||
virtual std::string LogParameters(){
|
virtual std::string LogParameters() {
|
||||||
std::stringstream sstream;
|
std::stringstream sstream;
|
||||||
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
|
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
|
||||||
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
|
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
|
||||||
return sstream.str();
|
return sstream.str();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
virtual std::string action_name() {return "ScalarAction";}
|
||||||
|
|
||||||
virtual std::string action_name(){return "ScalarAction";}
|
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // noop as no pseudoferms
|
||||||
|
|
||||||
virtual void refresh(const Field &U,
|
|
||||||
GridParallelRNG &pRNG){}; // noop as no pseudoferms
|
|
||||||
|
|
||||||
virtual RealD S(const Field &p) {
|
virtual RealD S(const Field &p) {
|
||||||
return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
|
return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
|
||||||
(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
|
(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
|
||||||
ScalarObs<Impl>::sumphider(p);
|
ScalarObs<Impl>::sumphider(p);
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual void deriv(const Field &p,
|
virtual void deriv(const Field &p,
|
||||||
Field &force) {
|
Field &force) {
|
||||||
Field tmp(p._grid);
|
Field tmp(p._grid);
|
||||||
Field p2(p._grid);
|
Field p2(p._grid);
|
||||||
ScalarObs<Impl>::phisquared(p2, p);
|
ScalarObs<Impl>::phisquared(p2, p);
|
||||||
tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
|
tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
|
||||||
for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
||||||
|
|
||||||
force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
|
force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
|
||||||
};
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} // Grid
|
|
||||||
|
|
||||||
|
} // namespace Grid
|
||||||
|
|
||||||
#endif // SCALAR_ACTION_H
|
#endif // SCALAR_ACTION_H
|
||||||
|
@ -5,20 +5,22 @@
|
|||||||
namespace Grid {
|
namespace Grid {
|
||||||
//namespace QCD {
|
//namespace QCD {
|
||||||
|
|
||||||
template <class S>
|
template <class S>
|
||||||
class ScalarImplTypes {
|
class ScalarImplTypes {
|
||||||
public:
|
public:
|
||||||
typedef S Simd;
|
typedef S Simd;
|
||||||
|
|
||||||
template <typename vtype>
|
template <typename vtype>
|
||||||
using iImplField = iScalar<iScalar<iScalar<vtype> > >;
|
using iImplField = iScalar<iScalar<iScalar<vtype> > >;
|
||||||
|
|
||||||
typedef iImplField<Simd> SiteField;
|
typedef iImplField<Simd> SiteField;
|
||||||
|
typedef SiteField SitePropagator;
|
||||||
template <typename vtype> using iImplScalar= iScalar<iScalar<iScalar<vtype > > >;
|
typedef SiteField SiteComplex;
|
||||||
typedef iImplScalar<Simd> ComplexField;
|
|
||||||
|
|
||||||
typedef Lattice<SiteField> Field;
|
typedef Lattice<SiteField> Field;
|
||||||
|
typedef Field ComplexField;
|
||||||
|
typedef Field FermionField;
|
||||||
|
typedef Field PropagatorField;
|
||||||
|
|
||||||
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
|
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
|
||||||
gaussian(pRNG, P);
|
gaussian(pRNG, P);
|
||||||
@ -26,11 +28,11 @@ namespace Grid {
|
|||||||
|
|
||||||
static inline Field projectForce(Field& P){return P;}
|
static inline Field projectForce(Field& P){return P;}
|
||||||
|
|
||||||
static inline void update_field(Field& P, Field& U, double ep){
|
static inline void update_field(Field& P, Field& U, double ep) {
|
||||||
U += P*ep;
|
U += P*ep;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline RealD FieldSquareNorm(Field& U){
|
static inline RealD FieldSquareNorm(Field& U) {
|
||||||
return (- sum(trace(U*U))/2.0);
|
return (- sum(trace(U*U))/2.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -46,46 +48,91 @@ namespace Grid {
|
|||||||
U = 1.0;
|
U = 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void MomentumSpacePropagator(Field &out, RealD m)
|
||||||
|
{
|
||||||
|
GridBase *grid = out._grid;
|
||||||
|
Field kmu(grid), one(grid);
|
||||||
|
const unsigned int nd = grid->_ndimension;
|
||||||
|
std::vector<int> &l = grid->_fdimensions;
|
||||||
|
|
||||||
|
one = Complex(1.0,0.0);
|
||||||
|
out = m*m;
|
||||||
|
for(int mu = 0; mu < nd; mu++)
|
||||||
|
{
|
||||||
|
Real twoPiL = M_PI*2./l[mu];
|
||||||
|
|
||||||
|
LatticeCoordinate(kmu,mu);
|
||||||
|
kmu = 2.*sin(.5*twoPiL*kmu);
|
||||||
|
out = out + kmu*kmu;
|
||||||
|
}
|
||||||
|
out = one/out;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void FreePropagator(const Field &in, Field &out,
|
||||||
|
const Field &momKernel)
|
||||||
|
{
|
||||||
|
FFT fft((GridCartesian *)in._grid);
|
||||||
|
Field inFT(in._grid);
|
||||||
|
|
||||||
|
fft.FFT_all_dim(inFT, in, FFT::forward);
|
||||||
|
inFT = inFT*momKernel;
|
||||||
|
fft.FFT_all_dim(out, inFT, FFT::backward);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void FreePropagator(const Field &in, Field &out, RealD m)
|
||||||
|
{
|
||||||
|
Field momKernel(in._grid);
|
||||||
|
|
||||||
|
MomentumSpacePropagator(momKernel, m);
|
||||||
|
FreePropagator(in, out, momKernel);
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class S, unsigned int N>
|
template <class S, unsigned int N>
|
||||||
class ScalarMatrixImplTypes {
|
class ScalarAdjMatrixImplTypes {
|
||||||
public:
|
public:
|
||||||
typedef S Simd;
|
typedef S Simd;
|
||||||
|
typedef QCD::SU<N> Group;
|
||||||
|
|
||||||
template <typename vtype> using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
|
template <typename vtype>
|
||||||
|
using iImplField = iScalar<iScalar<iMatrix<vtype, N>>>;
|
||||||
|
template <typename vtype>
|
||||||
|
using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;
|
||||||
|
|
||||||
typedef iImplField<Simd> SiteField;
|
typedef iImplField<Simd> SiteField;
|
||||||
typedef Lattice<SiteField> Field;
|
typedef SiteField SitePropagator;
|
||||||
|
typedef iImplComplex<Simd> SiteComplex;
|
||||||
|
|
||||||
template <typename vtype> using iImplScalar= iScalar<iScalar<iScalar<vtype > > >;
|
typedef Lattice<SiteField> Field;
|
||||||
typedef iImplScalar<Simd> ComplexField;
|
typedef Lattice<SiteComplex> ComplexField;
|
||||||
|
typedef Field FermionField;
|
||||||
|
typedef Field PropagatorField;
|
||||||
|
|
||||||
|
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
|
||||||
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
|
||||||
gaussian(pRNG, P);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline Field projectForce(Field& P){return P;}
|
static inline Field projectForce(Field& P) {return P;}
|
||||||
|
|
||||||
static inline void update_field(Field& P, Field& U, double ep){
|
static inline void update_field(Field& P, Field& U, double ep) {
|
||||||
U += P*ep;
|
U += P*ep;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline RealD FieldSquareNorm(Field& U){
|
static inline RealD FieldSquareNorm(Field& U) {
|
||||||
return (TensorRemove(- sum(trace(U*U))*0.5).real());
|
return (TensorRemove(sum(trace(U*U))).real());
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
gaussian(pRNG, U);
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
gaussian(pRNG, U);
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
U = 1.0;
|
U = zero;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -96,6 +143,18 @@ namespace Grid {
|
|||||||
typedef ScalarImplTypes<vReal> ScalarImplR;
|
typedef ScalarImplTypes<vReal> ScalarImplR;
|
||||||
typedef ScalarImplTypes<vRealF> ScalarImplF;
|
typedef ScalarImplTypes<vRealF> ScalarImplF;
|
||||||
typedef ScalarImplTypes<vRealD> ScalarImplD;
|
typedef ScalarImplTypes<vRealD> ScalarImplD;
|
||||||
|
typedef ScalarImplTypes<vComplex> ScalarImplCR;
|
||||||
|
typedef ScalarImplTypes<vComplexF> ScalarImplCF;
|
||||||
|
typedef ScalarImplTypes<vComplexD> ScalarImplCD;
|
||||||
|
|
||||||
|
// Hardcoding here the size of the matrices
|
||||||
|
typedef ScalarAdjMatrixImplTypes<vComplex, QCD::Nc> ScalarAdjImplR;
|
||||||
|
typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
|
||||||
|
typedef ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc> ScalarAdjImplD;
|
||||||
|
|
||||||
|
template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex, Colours >;
|
||||||
|
template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, Colours >;
|
||||||
|
template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, Colours >;
|
||||||
|
|
||||||
//}
|
//}
|
||||||
}
|
}
|
||||||
|
@ -6,10 +6,7 @@
|
|||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
Author: Guido Cossu <guido,cossu@ed.ac.uk>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: neo <cossu@post.kek.jp>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -30,55 +27,122 @@ directory
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef SCALAR_ACTION_H
|
#ifndef SCALAR_INT_ACTION_H
|
||||||
#define SCALAR_ACTION_H
|
#define SCALAR_INT_ACTION_H
|
||||||
|
|
||||||
|
|
||||||
|
// Note: this action can completely absorb the ScalarAction for real float fields
|
||||||
|
// use the scalarObjs to generalise the structure
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
// FIXME drop the QCD namespace everywhere here
|
// FIXME drop the QCD namespace everywhere here
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl, int Ndim >
|
||||||
class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
|
class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
|
||||||
public:
|
public:
|
||||||
INHERIT_FIELD_TYPES(Impl);
|
INHERIT_FIELD_TYPES(Impl);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
RealD mass_square;
|
RealD mass_square;
|
||||||
RealD lambda;
|
RealD lambda;
|
||||||
|
|
||||||
public:
|
|
||||||
ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
|
|
||||||
|
|
||||||
virtual std::string LogParameters(){
|
typedef typename Field::vector_object vobj;
|
||||||
|
typedef CartesianStencil<vobj,vobj> Stencil;
|
||||||
|
|
||||||
|
SimpleCompressor<vobj> compressor;
|
||||||
|
int npoint = 2*Ndim;
|
||||||
|
std::vector<int> directions;// = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions
|
||||||
|
std::vector<int> displacements;// = {1,1,1,1, -1,-1,-1,-1};
|
||||||
|
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
|
||||||
|
for (int mu = 0 ; mu < Ndim; mu++){
|
||||||
|
directions[mu] = mu; directions[mu+Ndim] = mu;
|
||||||
|
displacements[mu] = 1; displacements[mu+Ndim] = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual std::string LogParameters() {
|
||||||
std::stringstream sstream;
|
std::stringstream sstream;
|
||||||
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
|
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
|
||||||
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
|
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
|
||||||
return sstream.str();
|
return sstream.str();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual std::string action_name(){return "ScalarAction";}
|
virtual std::string action_name() {return "ScalarAction";}
|
||||||
|
|
||||||
virtual void refresh(const Field &U,
|
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
|
||||||
GridParallelRNG &pRNG){}; // noop as no pseudoferms
|
|
||||||
|
|
||||||
virtual RealD S(const Field &p) {
|
virtual RealD S(const Field &p) {
|
||||||
return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
|
assert(p._grid->Nd() == Ndim);
|
||||||
(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
|
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
|
||||||
ScalarObs<Impl>::sumphider(p);
|
phiStencil.HaloExchange(p, compressor);
|
||||||
|
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
|
||||||
|
phisquared = p*p;
|
||||||
|
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
|
||||||
|
for (int mu = 0; mu < Ndim; mu++) {
|
||||||
|
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
|
||||||
|
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
|
||||||
|
int permute_type;
|
||||||
|
StencilEntry *SE;
|
||||||
|
vobj temp2;
|
||||||
|
const vobj *temp, *t_p;
|
||||||
|
|
||||||
|
SE = phiStencil.GetEntry(permute_type, mu, i);
|
||||||
|
t_p = &p._odata[i];
|
||||||
|
if ( SE->_is_local ) {
|
||||||
|
temp = &p._odata[SE->_offset];
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
permute(temp2, *temp, permute_type);
|
||||||
|
action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
|
||||||
|
} else {
|
||||||
|
action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// action -= pshift*p + p*pshift;
|
||||||
|
}
|
||||||
|
// NB the trace in the algebra is normalised to 1/2
|
||||||
|
// minus sign coming from the antihermitian fields
|
||||||
|
return -(TensorRemove(sum(trace(action)))).real();
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual void deriv(const Field &p,
|
virtual void deriv(const Field &p, Field &force) {
|
||||||
Field &force) {
|
assert(p._grid->Nd() == Ndim);
|
||||||
Field tmp(p._grid);
|
force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
|
||||||
Field p2(p._grid);
|
// move this outside
|
||||||
ScalarObs<Impl>::phisquared(p2, p);
|
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
|
||||||
tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
|
phiStencil.HaloExchange(p, compressor);
|
||||||
for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
|
||||||
|
|
||||||
force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
|
//for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
||||||
};
|
for (int point = 0; point < npoint; point++) {
|
||||||
|
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
|
||||||
|
const vobj *temp;
|
||||||
|
vobj temp2;
|
||||||
|
int permute_type;
|
||||||
|
StencilEntry *SE;
|
||||||
|
SE = phiStencil.GetEntry(permute_type, point, i);
|
||||||
|
|
||||||
|
if ( SE->_is_local ) {
|
||||||
|
temp = &p._odata[SE->_offset];
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
permute(temp2, *temp, permute_type);
|
||||||
|
force._odata[i] -= temp2;
|
||||||
|
} else {
|
||||||
|
force._odata[i] -= *temp;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} // Grid
|
} // namespace Grid
|
||||||
|
|
||||||
#endif // SCALAR_ACTION_H
|
#endif // SCALAR_INT_ACTION_H
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user