mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-15 06:17:05 +01:00
Compare commits
3 Commits
feature/dw
...
feature/mi
Author | SHA1 | Date | |
---|---|---|---|
1e3fb32572 | |||
0d5af667d8 | |||
e9712bc7fb |
8
.gitignore
vendored
8
.gitignore
vendored
@ -92,7 +92,6 @@ build*/*
|
|||||||
#####################
|
#####################
|
||||||
*.xcodeproj/*
|
*.xcodeproj/*
|
||||||
build.sh
|
build.sh
|
||||||
.vscode
|
|
||||||
|
|
||||||
# Eigen source #
|
# Eigen source #
|
||||||
################
|
################
|
||||||
@ -107,10 +106,6 @@ lib/fftw/*
|
|||||||
m4/lt*
|
m4/lt*
|
||||||
m4/libtool.m4
|
m4/libtool.m4
|
||||||
|
|
||||||
# github pages #
|
|
||||||
################
|
|
||||||
gh-pages/
|
|
||||||
|
|
||||||
# Buck files #
|
# Buck files #
|
||||||
##############
|
##############
|
||||||
.buck*
|
.buck*
|
||||||
@ -121,5 +116,4 @@ make-bin-BUCK.sh
|
|||||||
# generated sources #
|
# generated sources #
|
||||||
#####################
|
#####################
|
||||||
lib/qcd/spin/gamma-gen/*.h
|
lib/qcd/spin/gamma-gen/*.h
|
||||||
lib/qcd/spin/gamma-gen/*.cc
|
lib/qcd/spin/gamma-gen/*.cc
|
||||||
|
|
76
.travis.yml
76
.travis.yml
@ -7,8 +7,64 @@ cache:
|
|||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- os: osx
|
- os: osx
|
||||||
osx_image: xcode8.3
|
osx_image: xcode7.2
|
||||||
compiler: clang
|
compiler: clang
|
||||||
|
- compiler: gcc
|
||||||
|
addons:
|
||||||
|
apt:
|
||||||
|
sources:
|
||||||
|
- ubuntu-toolchain-r-test
|
||||||
|
packages:
|
||||||
|
- g++-4.9
|
||||||
|
- libmpfr-dev
|
||||||
|
- libgmp-dev
|
||||||
|
- libmpc-dev
|
||||||
|
- libopenmpi-dev
|
||||||
|
- openmpi-bin
|
||||||
|
- binutils-dev
|
||||||
|
env: VERSION=-4.9
|
||||||
|
- compiler: gcc
|
||||||
|
addons:
|
||||||
|
apt:
|
||||||
|
sources:
|
||||||
|
- ubuntu-toolchain-r-test
|
||||||
|
packages:
|
||||||
|
- g++-5
|
||||||
|
- libmpfr-dev
|
||||||
|
- libgmp-dev
|
||||||
|
- libmpc-dev
|
||||||
|
- libopenmpi-dev
|
||||||
|
- openmpi-bin
|
||||||
|
- binutils-dev
|
||||||
|
env: VERSION=-5
|
||||||
|
- compiler: clang
|
||||||
|
addons:
|
||||||
|
apt:
|
||||||
|
sources:
|
||||||
|
- ubuntu-toolchain-r-test
|
||||||
|
packages:
|
||||||
|
- g++-4.8
|
||||||
|
- libmpfr-dev
|
||||||
|
- libgmp-dev
|
||||||
|
- libmpc-dev
|
||||||
|
- libopenmpi-dev
|
||||||
|
- openmpi-bin
|
||||||
|
- binutils-dev
|
||||||
|
env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
|
||||||
|
- compiler: clang
|
||||||
|
addons:
|
||||||
|
apt:
|
||||||
|
sources:
|
||||||
|
- ubuntu-toolchain-r-test
|
||||||
|
packages:
|
||||||
|
- g++-4.8
|
||||||
|
- libmpfr-dev
|
||||||
|
- libgmp-dev
|
||||||
|
- libmpc-dev
|
||||||
|
- libopenmpi-dev
|
||||||
|
- openmpi-bin
|
||||||
|
- binutils-dev
|
||||||
|
env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
|
||||||
|
|
||||||
before_install:
|
before_install:
|
||||||
- export GRIDDIR=`pwd`
|
- export GRIDDIR=`pwd`
|
||||||
@ -17,15 +73,13 @@ before_install:
|
|||||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
|
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
|
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
|
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
|
||||||
|
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
|
||||||
|
- if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
|
||||||
|
|
||||||
install:
|
install:
|
||||||
- export CC=$CC$VERSION
|
- export CC=$CC$VERSION
|
||||||
- export CXX=$CXX$VERSION
|
- export CXX=$CXX$VERSION
|
||||||
- echo $PATH
|
- echo $PATH
|
||||||
- which autoconf
|
|
||||||
- autoconf --version
|
|
||||||
- which automake
|
|
||||||
- automake --version
|
|
||||||
- which $CC
|
- which $CC
|
||||||
- $CC --version
|
- $CC --version
|
||||||
- which $CXX
|
- which $CXX
|
||||||
@ -38,9 +92,15 @@ script:
|
|||||||
- cd build
|
- cd build
|
||||||
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
|
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
|
||||||
- make -j4
|
- make -j4
|
||||||
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
- ./benchmarks/Benchmark_dwf --threads 1
|
||||||
- echo make clean
|
- echo make clean
|
||||||
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
|
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
|
||||||
- make -j4
|
- make -j4
|
||||||
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
- ./benchmarks/Benchmark_dwf --threads 1
|
||||||
- make check
|
- echo make clean
|
||||||
|
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
|
||||||
|
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
|
||||||
|
- make -j4
|
||||||
|
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
|
||||||
|
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
|
||||||
|
|
||||||
|
11
Makefile.am
11
Makefile.am
@ -3,15 +3,10 @@ SUBDIRS = lib benchmarks tests extras
|
|||||||
|
|
||||||
include $(top_srcdir)/doxygen.inc
|
include $(top_srcdir)/doxygen.inc
|
||||||
|
|
||||||
bin_SCRIPTS=grid-config
|
tests: all
|
||||||
|
$(MAKE) -C tests tests
|
||||||
|
|
||||||
|
.PHONY: tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
|
||||||
.PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
|
|
||||||
|
|
||||||
tests-local: all
|
|
||||||
bench-local: all
|
|
||||||
check-local: all
|
|
||||||
|
|
||||||
AM_CXXFLAGS += -I$(top_builddir)/include
|
AM_CXXFLAGS += -I$(top_builddir)/include
|
||||||
|
|
||||||
ACLOCAL_AMFLAGS = -I m4
|
ACLOCAL_AMFLAGS = -I m4
|
||||||
|
302
README.md
302
README.md
@ -1,13 +1,41 @@
|
|||||||
# Grid [),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [](https://travis-ci.org/paboyle/Grid)
|
# Grid
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<td>Last stable release</td>
|
||||||
|
<td><a href="https://travis-ci.org/paboyle/Grid">
|
||||||
|
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Development branch</td>
|
||||||
|
<td><a href="https://travis-ci.org/paboyle/Grid">
|
||||||
|
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
**Data parallel C++ mathematical object library.**
|
**Data parallel C++ mathematical object library.**
|
||||||
|
|
||||||
License: GPL v2.
|
License: GPL v2.
|
||||||
|
|
||||||
Last update June 2017.
|
Last update Nov 2016.
|
||||||
|
|
||||||
_Please do not send pull requests to the `master` branch which is reserved for releases._
|
_Please do not send pull requests to the `master` branch which is reserved for releases._
|
||||||
|
|
||||||
|
### Bug report
|
||||||
|
|
||||||
|
_To help us tracking and solving more efficiently issues with Grid, please report problems using the issue system of GitHub rather than sending emails to Grid developers._
|
||||||
|
|
||||||
|
When you file an issue, please go though the following checklist:
|
||||||
|
|
||||||
|
1. Check that the code is pointing to the `HEAD` of `develop` or any commit in `master` which is tagged with a version number.
|
||||||
|
2. Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) or `sysctl machdep.cpu.brand_string` (macOS) and the full output the `--version` option of your compiler.
|
||||||
|
3. Give the exact `configure` command used.
|
||||||
|
4. Attach `config.log`.
|
||||||
|
5. Attach `config.summary`.
|
||||||
|
6. Attach the output of `make V=1`.
|
||||||
|
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Description
|
### Description
|
||||||
@ -30,68 +58,13 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
|
|||||||
for most programmers.
|
for most programmers.
|
||||||
|
|
||||||
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
|
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
|
||||||
Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
|
Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
|
||||||
|
|
||||||
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.
|
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
|
||||||
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
|
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
|
||||||
|
|
||||||
MPI, OpenMP, and SIMD parallelism are present in the library.
|
MPI, OpenMP, and SIMD parallelism are present in the library.
|
||||||
Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
|
Please see https://arxiv.org/abs/1512.03487 for more detail.
|
||||||
|
|
||||||
|
|
||||||
### Compilers
|
|
||||||
|
|
||||||
Intel ICPC v16.0.3 and later
|
|
||||||
|
|
||||||
Clang v3.5 and later (need 3.8 and later for OpenMP)
|
|
||||||
|
|
||||||
GCC v4.9.x (recommended)
|
|
||||||
|
|
||||||
GCC v6.3 and later
|
|
||||||
|
|
||||||
### Important:
|
|
||||||
|
|
||||||
Some versions of GCC appear to have a bug under high optimisation (-O2, -O3).
|
|
||||||
|
|
||||||
The safety of these compiler versions cannot be guaranteed at this time. Follow Issue 100 for details and updates.
|
|
||||||
|
|
||||||
GCC v5.x
|
|
||||||
|
|
||||||
GCC v6.1, v6.2
|
|
||||||
|
|
||||||
### Bug report
|
|
||||||
|
|
||||||
_To help us tracking and solving more efficiently issues with Grid, please report problems using the issue system of GitHub rather than sending emails to Grid developers._
|
|
||||||
|
|
||||||
When you file an issue, please go though the following checklist:
|
|
||||||
|
|
||||||
1. Check that the code is pointing to the `HEAD` of `develop` or any commit in `master` which is tagged with a version number.
|
|
||||||
2. Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) or `sysctl machdep.cpu.brand_string` (macOS) and the full output the `--version` option of your compiler.
|
|
||||||
3. Give the exact `configure` command used.
|
|
||||||
4. Attach `config.log`.
|
|
||||||
5. Attach `grid.config.summary`.
|
|
||||||
6. Attach the output of `make V=1`.
|
|
||||||
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
|
|
||||||
|
|
||||||
### Required libraries
|
|
||||||
Grid requires:
|
|
||||||
|
|
||||||
[GMP](https://gmplib.org/),
|
|
||||||
|
|
||||||
[MPFR](http://www.mpfr.org/)
|
|
||||||
|
|
||||||
Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library.
|
|
||||||
|
|
||||||
Grid optionally uses:
|
|
||||||
|
|
||||||
[HDF5](https://support.hdfgroup.org/HDF5/)
|
|
||||||
|
|
||||||
[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support.
|
|
||||||
|
|
||||||
[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
|
|
||||||
|
|
||||||
LAPACK either generic version or Intel MKL library.
|
|
||||||
|
|
||||||
|
|
||||||
### Quick start
|
### Quick start
|
||||||
First, start by cloning the repository:
|
First, start by cloning the repository:
|
||||||
@ -122,10 +95,10 @@ install Grid. Other options are detailed in the next section, you can also use `
|
|||||||
`CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
|
`CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
|
||||||
customise the build.
|
customise the build.
|
||||||
|
|
||||||
Finally, you can build, check, and install Grid:
|
Finally, you can build and install Grid:
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
make; make check; make install
|
make; make install
|
||||||
```
|
```
|
||||||
|
|
||||||
To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
|
To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
|
||||||
@ -148,7 +121,7 @@ If you want to build all the tests at once just use `make tests`.
|
|||||||
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
|
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
|
||||||
- `--enable-precision={single|double}`: set the default precision (default: `double`).
|
- `--enable-precision={single|double}`: set the default precision (default: `double`).
|
||||||
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
|
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
|
||||||
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
|
- `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48 `).
|
||||||
- `--disable-timers`: disable system dependent high-resolution timers.
|
- `--disable-timers`: disable system dependent high-resolution timers.
|
||||||
- `--enable-chroma`: enable Chroma regression tests.
|
- `--enable-chroma`: enable Chroma regression tests.
|
||||||
- `--enable-doxygen-doc`: enable the Doxygen documentation generation (build with `make doxygen-doc`)
|
- `--enable-doxygen-doc`: enable the Doxygen documentation generation (build with `make doxygen-doc`)
|
||||||
@ -162,6 +135,7 @@ The following options can be use with the `--enable-comms=` option to target dif
|
|||||||
| `none` | no communications |
|
| `none` | no communications |
|
||||||
| `mpi[-auto]` | MPI communications |
|
| `mpi[-auto]` | MPI communications |
|
||||||
| `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
|
| `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
|
||||||
|
| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
|
||||||
| `shmem ` | Cray SHMEM communications |
|
| `shmem ` | Cray SHMEM communications |
|
||||||
|
|
||||||
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
|
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
|
||||||
@ -179,13 +153,13 @@ The following options can be use with the `--enable-simd=` option to target diff
|
|||||||
| `AVXFMA4` | AVX (256 bit) + FMA4 |
|
| `AVXFMA4` | AVX (256 bit) + FMA4 |
|
||||||
| `AVX2` | AVX 2 (256 bit) |
|
| `AVX2` | AVX 2 (256 bit) |
|
||||||
| `AVX512` | AVX 512 bit |
|
| `AVX512` | AVX 512 bit |
|
||||||
| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) |
|
| `QPX` | QPX (256 bit) |
|
||||||
| `QPX` | IBM QPX (256 bit) |
|
|
||||||
|
|
||||||
Alternatively, some CPU codenames can be directly used:
|
Alternatively, some CPU codenames can be directly used:
|
||||||
|
|
||||||
| `<code>` | Description |
|
| `<code>` | Description |
|
||||||
| ----------- | -------------------------------------- |
|
| ----------- | -------------------------------------- |
|
||||||
|
| `KNC` | [Intel Xeon Phi codename Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
|
||||||
| `KNL` | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
|
| `KNL` | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
|
||||||
| `BGQ` | Blue Gene/Q |
|
| `BGQ` | Blue Gene/Q |
|
||||||
|
|
||||||
@ -202,205 +176,21 @@ The following configuration is recommended for the Intel Knights Landing platfor
|
|||||||
``` bash
|
``` bash
|
||||||
../configure --enable-precision=double\
|
../configure --enable-precision=double\
|
||||||
--enable-simd=KNL \
|
--enable-simd=KNL \
|
||||||
--enable-comms=mpi-auto \
|
--enable-comms=mpi-auto \
|
||||||
|
--with-gmp=<path> \
|
||||||
|
--with-mpfr=<path> \
|
||||||
--enable-mkl \
|
--enable-mkl \
|
||||||
CXX=icpc MPICXX=mpiicpc
|
CXX=icpc MPICXX=mpiicpc
|
||||||
```
|
```
|
||||||
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
|
|
||||||
|
|
||||||
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
|
where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
../configure --enable-precision=double\
|
../configure --enable-precision=double\
|
||||||
--enable-simd=KNL \
|
--enable-simd=KNL \
|
||||||
--enable-comms=mpi \
|
--enable-comms=mpi \
|
||||||
|
--with-gmp=<path> \
|
||||||
|
--with-mpfr=<path> \
|
||||||
--enable-mkl \
|
--enable-mkl \
|
||||||
CXX=CC CC=cc
|
CXX=CC CC=cc
|
||||||
```
|
```
|
||||||
|
|
||||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
|
|
||||||
``` bash
|
|
||||||
--with-gmp=<path> \
|
|
||||||
--with-mpfr=<path> \
|
|
||||||
```
|
|
||||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
|
|
||||||
|
|
||||||
Knight's Landing with Intel Omnipath adapters with two adapters per node
|
|
||||||
presently performs better with use of more than one rank per node, using shared memory
|
|
||||||
for interior communication. This is the mpi3 communications implementation.
|
|
||||||
We recommend four ranks per node for best performance, but optimum is local volume dependent.
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
../configure --enable-precision=double\
|
|
||||||
--enable-simd=KNL \
|
|
||||||
--enable-comms=mpi3-auto \
|
|
||||||
--enable-mkl \
|
|
||||||
CC=icpc MPICXX=mpiicpc
|
|
||||||
```
|
|
||||||
|
|
||||||
### Build setup for Intel Haswell Xeon platform
|
|
||||||
|
|
||||||
The following configuration is recommended for the Intel Haswell platform:
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
../configure --enable-precision=double\
|
|
||||||
--enable-simd=AVX2 \
|
|
||||||
--enable-comms=mpi3-auto \
|
|
||||||
--enable-mkl \
|
|
||||||
CXX=icpc MPICXX=mpiicpc
|
|
||||||
```
|
|
||||||
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
|
|
||||||
|
|
||||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
|
|
||||||
``` bash
|
|
||||||
--with-gmp=<path> \
|
|
||||||
--with-mpfr=<path> \
|
|
||||||
```
|
|
||||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
|
|
||||||
|
|
||||||
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
../configure --enable-precision=double\
|
|
||||||
--enable-simd=AVX2 \
|
|
||||||
--enable-comms=mpi3 \
|
|
||||||
--enable-mkl \
|
|
||||||
CXX=CC CC=cc
|
|
||||||
```
|
|
||||||
Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of
|
|
||||||
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
|
|
||||||
```
|
|
||||||
export I_MPI_PIN=1
|
|
||||||
```
|
|
||||||
This is the default.
|
|
||||||
|
|
||||||
### Build setup for Intel Skylake Xeon platform
|
|
||||||
|
|
||||||
The following configuration is recommended for the Intel Skylake platform:
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
../configure --enable-precision=double\
|
|
||||||
--enable-simd=AVX512 \
|
|
||||||
--enable-comms=mpi3 \
|
|
||||||
--enable-mkl \
|
|
||||||
CXX=mpiicpc
|
|
||||||
```
|
|
||||||
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
|
|
||||||
|
|
||||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
|
|
||||||
``` bash
|
|
||||||
--with-gmp=<path> \
|
|
||||||
--with-mpfr=<path> \
|
|
||||||
```
|
|
||||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
|
|
||||||
|
|
||||||
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
../configure --enable-precision=double\
|
|
||||||
--enable-simd=AVX512 \
|
|
||||||
--enable-comms=mpi3 \
|
|
||||||
--enable-mkl \
|
|
||||||
CXX=CC CC=cc
|
|
||||||
```
|
|
||||||
Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of
|
|
||||||
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
|
|
||||||
```
|
|
||||||
export I_MPI_PIN=1
|
|
||||||
```
|
|
||||||
This is the default.
|
|
||||||
|
|
||||||
#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping):
|
|
||||||
|
|
||||||
mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18
|
|
||||||
|
|
||||||
TBA
|
|
||||||
|
|
||||||
|
|
||||||
### Build setup for AMD EPYC / RYZEN
|
|
||||||
|
|
||||||
The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
|
|
||||||
So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
|
|
||||||
are common. Each chip within the module exposes a separate NUMA domain.
|
|
||||||
There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
|
|
||||||
MPI-3 is recommended with the use of four ranks per socket,
|
|
||||||
and 8 threads per rank.
|
|
||||||
|
|
||||||
The following configuration is recommended for the AMD EPYC platform.
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
../configure --enable-precision=double\
|
|
||||||
--enable-simd=AVX2 \
|
|
||||||
--enable-comms=mpi3 \
|
|
||||||
CXX=mpicxx
|
|
||||||
```
|
|
||||||
|
|
||||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
|
|
||||||
``` bash
|
|
||||||
--with-gmp=<path> \
|
|
||||||
--with-mpfr=<path> \
|
|
||||||
```
|
|
||||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
|
|
||||||
|
|
||||||
Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
|
|
||||||
This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this.
|
|
||||||
|
|
||||||
It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and
|
|
||||||
shared memory to communicate within this node:
|
|
||||||
|
|
||||||
mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
|
|
||||||
|
|
||||||
Where omp_bind.sh does the following:
|
|
||||||
```
|
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
numanode=` expr $PMI_RANK % 8 `
|
|
||||||
basecore=`expr $numanode \* 16`
|
|
||||||
core0=`expr $basecore + 0 `
|
|
||||||
core1=`expr $basecore + 2 `
|
|
||||||
core2=`expr $basecore + 4 `
|
|
||||||
core3=`expr $basecore + 6 `
|
|
||||||
core4=`expr $basecore + 8 `
|
|
||||||
core5=`expr $basecore + 10 `
|
|
||||||
core6=`expr $basecore + 12 `
|
|
||||||
core7=`expr $basecore + 14 `
|
|
||||||
|
|
||||||
export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
|
|
||||||
echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY
|
|
||||||
|
|
||||||
$@
|
|
||||||
```
|
|
||||||
|
|
||||||
Performance:
|
|
||||||
|
|
||||||
#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping):
|
|
||||||
|
|
||||||
mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
|
|
||||||
|
|
||||||
TBA
|
|
||||||
|
|
||||||
### Build setup for BlueGene/Q
|
|
||||||
|
|
||||||
To be written...
|
|
||||||
|
|
||||||
### Build setup for ARM Neon
|
|
||||||
|
|
||||||
To be written...
|
|
||||||
|
|
||||||
### Build setup for laptops, other compilers, non-cluster builds
|
|
||||||
|
|
||||||
Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX),
|
|
||||||
and omit the enable-mkl flag.
|
|
||||||
|
|
||||||
Single node builds are enabled with
|
|
||||||
```
|
|
||||||
--enable-comms=none
|
|
||||||
```
|
|
||||||
|
|
||||||
FFTW support that is not in the default search path may then enabled with
|
|
||||||
```
|
|
||||||
--with-fftw=<installpath>
|
|
||||||
```
|
|
||||||
|
|
||||||
BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
|
|
||||||
|
|
70
TODO
70
TODO
@ -1,35 +1,6 @@
|
|||||||
TODO:
|
TODO:
|
||||||
---------------
|
---------------
|
||||||
|
|
||||||
Large item work list:
|
|
||||||
|
|
||||||
1)- BG/Q port and check ; Andrew says ok.
|
|
||||||
2)- Christoph's local basis expansion Lanczos
|
|
||||||
--
|
|
||||||
3a)- RNG I/O in ILDG/SciDAC (minor)
|
|
||||||
3b)- Precision conversion and sort out localConvert <-- partial/easy
|
|
||||||
3c)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
|
|
||||||
4)- Physical propagator interface
|
|
||||||
5)- Conserved currents
|
|
||||||
6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
|
|
||||||
7)- HDCR resume
|
|
||||||
|
|
||||||
Recent DONE
|
|
||||||
-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O ; <-- DONE ; bmark cori
|
|
||||||
-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
|
|
||||||
-- GaugeFix into central location <-- DONE
|
|
||||||
-- Scidac and Ildg metadata handling <-- DONE
|
|
||||||
-- Binary I/O MPI2 IO <-- DONE
|
|
||||||
-- Binary I/O speed up & x-strips <-- DONE
|
|
||||||
-- Cut down the exterior overhead <-- DONE
|
|
||||||
-- Interior legs from SHM comms <-- DONE
|
|
||||||
-- Half-precision comms <-- DONE
|
|
||||||
-- Merge high precision reduction into develop <-- DONE
|
|
||||||
-- BlockCG, BCGrQ <-- DONE
|
|
||||||
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE
|
|
||||||
-- slice* linalg routines for multiRHS, BlockCG
|
|
||||||
|
|
||||||
-----
|
|
||||||
* Forces; the UdSdU term in gauge force term is half of what I think it should
|
* Forces; the UdSdU term in gauge force term is half of what I think it should
|
||||||
be. This is a consequence of taking ONLY the first term in:
|
be. This is a consequence of taking ONLY the first term in:
|
||||||
|
|
||||||
@ -50,8 +21,16 @@ Recent DONE
|
|||||||
This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two.
|
This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two.
|
||||||
This 2x is applied by hand in the fermion routines and in the Test_rect_force routine.
|
This 2x is applied by hand in the fermion routines and in the Test_rect_force routine.
|
||||||
|
|
||||||
|
|
||||||
|
Policies:
|
||||||
|
|
||||||
|
* Link smearing/boundary conds; Policy class based implementation ; framework more in place
|
||||||
|
|
||||||
* Support different boundary conditions (finite temp, chem. potential ... )
|
* Support different boundary conditions (finite temp, chem. potential ... )
|
||||||
|
|
||||||
|
* Support different fermion representations?
|
||||||
|
- contained entirely within the integrator presently
|
||||||
|
|
||||||
- Sign of force term.
|
- Sign of force term.
|
||||||
|
|
||||||
- Reversibility test.
|
- Reversibility test.
|
||||||
@ -62,6 +41,11 @@ Recent DONE
|
|||||||
|
|
||||||
- Audit oIndex usage for cb behaviour
|
- Audit oIndex usage for cb behaviour
|
||||||
|
|
||||||
|
- Rectangle gauge actions.
|
||||||
|
Iwasaki,
|
||||||
|
Symanzik,
|
||||||
|
... etc...
|
||||||
|
|
||||||
- Prepare multigrid for HMC. - Alternate setup schemes.
|
- Prepare multigrid for HMC. - Alternate setup schemes.
|
||||||
|
|
||||||
- Support for ILDG --- ugly, not done
|
- Support for ILDG --- ugly, not done
|
||||||
@ -71,11 +55,9 @@ Recent DONE
|
|||||||
- FFTnD ?
|
- FFTnD ?
|
||||||
|
|
||||||
- Gparity; hand opt use template specialisation elegance to enable the optimised paths ?
|
- Gparity; hand opt use template specialisation elegance to enable the optimised paths ?
|
||||||
|
|
||||||
- Gparity force term; Gparity (R)HMC.
|
- Gparity force term; Gparity (R)HMC.
|
||||||
|
- Random number state save restore
|
||||||
- Mobius implementation clean up to rmove #if 0 stale code sequences
|
- Mobius implementation clean up to rmove #if 0 stale code sequences
|
||||||
|
|
||||||
- CG -- profile carefully, kernel fusion, whole CG performance measurements.
|
- CG -- profile carefully, kernel fusion, whole CG performance measurements.
|
||||||
|
|
||||||
================================================================
|
================================================================
|
||||||
@ -108,7 +90,6 @@ Insert/Extract
|
|||||||
Not sure of status of this -- reverify. Things are working nicely now though.
|
Not sure of status of this -- reverify. Things are working nicely now though.
|
||||||
|
|
||||||
* Make the Tensor types and Complex etc... play more nicely.
|
* Make the Tensor types and Complex etc... play more nicely.
|
||||||
|
|
||||||
- TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
|
- TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
|
||||||
QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
|
QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
|
||||||
want to introduce a syntax that does not require this.
|
want to introduce a syntax that does not require this.
|
||||||
@ -131,8 +112,6 @@ Not sure of status of this -- reverify. Things are working nicely now though.
|
|||||||
RECENT
|
RECENT
|
||||||
---------------
|
---------------
|
||||||
|
|
||||||
- Support different fermion representations? -- DONE
|
|
||||||
- contained entirely within the integrator presently
|
|
||||||
- Clean up HMC -- DONE
|
- Clean up HMC -- DONE
|
||||||
- LorentzScalar<GaugeField> gets Gauge link type (cleaner). -- DONE
|
- LorentzScalar<GaugeField> gets Gauge link type (cleaner). -- DONE
|
||||||
- Simplified the integrators a bit. -- DONE
|
- Simplified the integrators a bit. -- DONE
|
||||||
@ -144,26 +123,6 @@ RECENT
|
|||||||
- Parallel io improvements -- DONE
|
- Parallel io improvements -- DONE
|
||||||
- Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE
|
- Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE
|
||||||
|
|
||||||
|
|
||||||
DONE:
|
|
||||||
- MultiArray -- MultiRHS done
|
|
||||||
- ConjugateGradientMultiShift -- DONE
|
|
||||||
- MCR -- DONE
|
|
||||||
- Remez -- Mike or Boost? -- DONE
|
|
||||||
- Proto (ET) -- DONE
|
|
||||||
- uBlas -- DONE ; Eigen
|
|
||||||
- Potentially Useful Boost libraries -- DONE ; Eigen
|
|
||||||
- Aligned allocator; memory pool -- DONE
|
|
||||||
- Multiprecision -- DONE
|
|
||||||
- Serialization -- DONE
|
|
||||||
- Regex -- Not needed
|
|
||||||
- Tokenize -- Why?
|
|
||||||
|
|
||||||
- Random number state save restore -- DONE
|
|
||||||
- Rectangle gauge actions. -- DONE
|
|
||||||
Iwasaki,
|
|
||||||
Symanzik,
|
|
||||||
... etc...
|
|
||||||
Done: Cayley, Partial , ContFrac force terms.
|
Done: Cayley, Partial , ContFrac force terms.
|
||||||
|
|
||||||
DONE
|
DONE
|
||||||
@ -248,7 +207,6 @@ Done
|
|||||||
FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
|
FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
|
||||||
======================================================================================================
|
======================================================================================================
|
||||||
|
|
||||||
* Link smearing/boundary conds; Policy class based implementation ; framework more in place -- DONE
|
|
||||||
* Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE
|
* Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE
|
||||||
user pass these? Is this a QCD specific?
|
user pass these? Is this a QCD specific?
|
||||||
|
|
||||||
|
9
VERSION
9
VERSION
@ -1,5 +1,6 @@
|
|||||||
Version : 0.7.0
|
Version : 0.6.0
|
||||||
|
|
||||||
- Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
|
- AVX512, AVX2, AVX, SSE good
|
||||||
- MPI and MPI3 comms optimisations for KNL and OPA finished
|
- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
|
||||||
- Half precision comms
|
- MPI and MPI3
|
||||||
|
- HiRep, Smearing, Generic gauge group
|
||||||
|
@ -1,800 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./benchmarks/Benchmark_memory_bandwidth.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
using namespace Grid::QCD;
|
|
||||||
|
|
||||||
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
|
|
||||||
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
|
|
||||||
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
|
|
||||||
|
|
||||||
|
|
||||||
std::vector<int> L_list;
|
|
||||||
std::vector<int> Ls_list;
|
|
||||||
std::vector<double> mflop_list;
|
|
||||||
|
|
||||||
double mflop_ref;
|
|
||||||
double mflop_ref_err;
|
|
||||||
|
|
||||||
int NN_global;
|
|
||||||
|
|
||||||
struct time_statistics{
|
|
||||||
double mean;
|
|
||||||
double err;
|
|
||||||
double min;
|
|
||||||
double max;
|
|
||||||
|
|
||||||
void statistics(std::vector<double> v){
|
|
||||||
double sum = std::accumulate(v.begin(), v.end(), 0.0);
|
|
||||||
mean = sum / v.size();
|
|
||||||
|
|
||||||
std::vector<double> diff(v.size());
|
|
||||||
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
|
|
||||||
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
|
|
||||||
err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
|
|
||||||
|
|
||||||
auto result = std::minmax_element(v.begin(), v.end());
|
|
||||||
min = *result.first;
|
|
||||||
max = *result.second;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
void comms_header(){
|
|
||||||
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
|
|
||||||
<<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
|
|
||||||
};
|
|
||||||
|
|
||||||
Gamma::Algebra Gmu [] = {
|
|
||||||
Gamma::Algebra::GammaX,
|
|
||||||
Gamma::Algebra::GammaY,
|
|
||||||
Gamma::Algebra::GammaZ,
|
|
||||||
Gamma::Algebra::GammaT
|
|
||||||
};
|
|
||||||
struct controls {
|
|
||||||
int Opt;
|
|
||||||
int CommsOverlap;
|
|
||||||
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
|
|
||||||
// int HugePages;
|
|
||||||
};
|
|
||||||
|
|
||||||
class Benchmark {
|
|
||||||
public:
|
|
||||||
static void Decomposition (void ) {
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
|
||||||
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvReal : "<<sizeof(vReal )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvComplex : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
static void Comms(void)
|
|
||||||
{
|
|
||||||
int Nloop=200;
|
|
||||||
int nmu=0;
|
|
||||||
int maxlat=32;
|
|
||||||
|
|
||||||
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
|
|
||||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
|
||||||
|
|
||||||
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
|
|
||||||
|
|
||||||
std::vector<double> t_time(Nloop);
|
|
||||||
time_statistics timestat;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
|
||||||
comms_header();
|
|
||||||
|
|
||||||
for(int lat=4;lat<=maxlat;lat+=4){
|
|
||||||
for(int Ls=8;Ls<=8;Ls*=2){
|
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],
|
|
||||||
lat*mpi_layout[1],
|
|
||||||
lat*mpi_layout[2],
|
|
||||||
lat*mpi_layout[3]});
|
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
|
||||||
RealD Nrank = Grid._Nprocessors;
|
|
||||||
RealD Nnode = Grid.NodeCount();
|
|
||||||
RealD ppn = Nrank/Nnode;
|
|
||||||
|
|
||||||
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
|
||||||
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
|
||||||
Grid.ShmBufferFreeAll();
|
|
||||||
for(int d=0;d<8;d++){
|
|
||||||
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
|
||||||
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
|
||||||
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
|
||||||
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
|
||||||
}
|
|
||||||
|
|
||||||
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
|
||||||
int ncomm;
|
|
||||||
double dbytes;
|
|
||||||
std::vector<double> times(Nloop);
|
|
||||||
for(int i=0;i<Nloop;i++){
|
|
||||||
|
|
||||||
double start=usecond();
|
|
||||||
|
|
||||||
dbytes=0;
|
|
||||||
ncomm=0;
|
|
||||||
|
|
||||||
parallel_for(int dir=0;dir<8;dir++){
|
|
||||||
|
|
||||||
double tbytes;
|
|
||||||
int mu =dir % 4;
|
|
||||||
|
|
||||||
if (mpi_layout[mu]>1 ) {
|
|
||||||
|
|
||||||
int xmit_to_rank;
|
|
||||||
int recv_from_rank;
|
|
||||||
if ( dir == mu ) {
|
|
||||||
int comm_proc=1;
|
|
||||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
|
||||||
} else {
|
|
||||||
int comm_proc = mpi_layout[mu]-1;
|
|
||||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
|
||||||
}
|
|
||||||
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
|
|
||||||
(void *)&rbuf[dir][0], recv_from_rank,
|
|
||||||
bytes,dir);
|
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
#pragma omp atomic
|
|
||||||
#endif
|
|
||||||
ncomm++;
|
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
#pragma omp atomic
|
|
||||||
#endif
|
|
||||||
dbytes+=tbytes;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Grid.Barrier();
|
|
||||||
double stop=usecond();
|
|
||||||
t_time[i] = stop-start; // microseconds
|
|
||||||
}
|
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
|
||||||
// for(int i=0;i<t_time.size();i++){
|
|
||||||
// std::cout << i<<" "<<t_time[i]<<std::endl;
|
|
||||||
// }
|
|
||||||
|
|
||||||
dbytes=dbytes*ppn;
|
|
||||||
double xbytes = dbytes*0.5;
|
|
||||||
double rbytes = dbytes*0.5;
|
|
||||||
double bidibytes = dbytes;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
|
|
||||||
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
|
|
||||||
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
|
|
||||||
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
|
|
||||||
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
|
||||||
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void Memory(void)
|
|
||||||
{
|
|
||||||
const int Nvec=8;
|
|
||||||
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
|
|
||||||
typedef iVector<vReal,Nvec> Vec;
|
|
||||||
|
|
||||||
std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
|
|
||||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
|
||||||
|
|
||||||
uint64_t NP;
|
|
||||||
uint64_t NN;
|
|
||||||
|
|
||||||
|
|
||||||
uint64_t lmax=48;
|
|
||||||
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
|
|
||||||
|
|
||||||
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
|
||||||
for(int lat=8;lat<=lmax;lat+=4){
|
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
|
||||||
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
|
||||||
|
|
||||||
NP= Grid.RankCount();
|
|
||||||
NN =Grid.NodeCount();
|
|
||||||
|
|
||||||
Vec rn ; random(sRNG,rn);
|
|
||||||
|
|
||||||
LatticeVec z(&Grid); z=rn;
|
|
||||||
LatticeVec x(&Grid); x=rn;
|
|
||||||
LatticeVec y(&Grid); y=rn;
|
|
||||||
double a=2.0;
|
|
||||||
|
|
||||||
uint64_t Nloop=NLOOP;
|
|
||||||
|
|
||||||
double start=usecond();
|
|
||||||
for(int i=0;i<Nloop;i++){
|
|
||||||
z=a*x-y;
|
|
||||||
x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
|
|
||||||
y._odata[4]=z._odata[4];
|
|
||||||
}
|
|
||||||
double stop=usecond();
|
|
||||||
double time = (stop-start)/Nloop*1000;
|
|
||||||
|
|
||||||
double flops=vol*Nvec*2;// mul,add
|
|
||||||
double bytes=3.0*vol*Nvec*sizeof(Real);
|
|
||||||
std::cout<<GridLogMessage<<std::setprecision(3)
|
|
||||||
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
|
|
||||||
<< "\t\t"<< bytes/time/NN <<std::endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
static double DWF5(int Ls,int L)
|
|
||||||
{
|
|
||||||
RealD mass=0.1;
|
|
||||||
RealD M5 =1.8;
|
|
||||||
|
|
||||||
double mflops;
|
|
||||||
double mflops_best = 0;
|
|
||||||
double mflops_worst= 0;
|
|
||||||
std::vector<double> mflops_all;
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////
|
|
||||||
// Set/Get the layout & grid size
|
|
||||||
///////////////////////////////////////////////////////
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
|
|
||||||
std::vector<int> local({L,L,L,L});
|
|
||||||
|
|
||||||
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
|
|
||||||
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
|
||||||
uint64_t NP = TmpGrid->RankCount();
|
|
||||||
uint64_t NN = TmpGrid->NodeCount();
|
|
||||||
NN_global=NN;
|
|
||||||
uint64_t SHM=NP/NN;
|
|
||||||
|
|
||||||
std::vector<int> internal;
|
|
||||||
if ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
|
|
||||||
else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
|
|
||||||
else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
|
|
||||||
else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
|
|
||||||
else assert(0);
|
|
||||||
|
|
||||||
std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
|
|
||||||
std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
|
|
||||||
|
|
||||||
///////// Welcome message ////////////
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
|
|
||||||
///////// Lattice Init ////////////
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
|
||||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
|
||||||
|
|
||||||
///////// RNG Init ////////////
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
|
||||||
|
|
||||||
///////// Source preparation ////////////
|
|
||||||
LatticeFermion src (sFGrid); random(RNG5,src);
|
|
||||||
LatticeFermion tmp (sFGrid);
|
|
||||||
|
|
||||||
RealD N2 = 1.0/::sqrt(norm2(src));
|
|
||||||
src = src*N2;
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
|
|
||||||
|
|
||||||
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
|
|
||||||
LatticeFermion src_e (sFrbGrid);
|
|
||||||
LatticeFermion src_o (sFrbGrid);
|
|
||||||
LatticeFermion r_e (sFrbGrid);
|
|
||||||
LatticeFermion r_o (sFrbGrid);
|
|
||||||
LatticeFermion r_eo (sFGrid);
|
|
||||||
LatticeFermion err (sFGrid);
|
|
||||||
{
|
|
||||||
|
|
||||||
pickCheckerboard(Even,src_e,src);
|
|
||||||
pickCheckerboard(Odd,src_o,src);
|
|
||||||
|
|
||||||
#if defined(AVX512)
|
|
||||||
const int num_cases = 6;
|
|
||||||
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
|
|
||||||
#else
|
|
||||||
const int num_cases = 4;
|
|
||||||
std::string fmt("U/S ; U/O ; G/S ; G/O ");
|
|
||||||
#endif
|
|
||||||
controls Cases [] = {
|
|
||||||
#ifdef AVX512
|
|
||||||
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
#endif
|
|
||||||
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
|
|
||||||
};
|
|
||||||
|
|
||||||
for(int c=0;c<num_cases;c++) {
|
|
||||||
|
|
||||||
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
|
|
||||||
QCD::WilsonKernelsStatic::Opt = Cases[c].Opt;
|
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
|
||||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
|
||||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
|
|
||||||
int nwarm = 100;
|
|
||||||
uint64_t ncall = 1000;
|
|
||||||
|
|
||||||
double t0=usecond();
|
|
||||||
sFGrid->Barrier();
|
|
||||||
for(int i=0;i<nwarm;i++){
|
|
||||||
sDw.DhopEO(src_o,r_e,DaggerNo);
|
|
||||||
}
|
|
||||||
sFGrid->Barrier();
|
|
||||||
double t1=usecond();
|
|
||||||
|
|
||||||
sDw.ZeroCounters();
|
|
||||||
time_statistics timestat;
|
|
||||||
std::vector<double> t_time(ncall);
|
|
||||||
for(uint64_t i=0;i<ncall;i++){
|
|
||||||
t0=usecond();
|
|
||||||
sDw.DhopEO(src_o,r_e,DaggerNo);
|
|
||||||
t1=usecond();
|
|
||||||
t_time[i] = t1-t0;
|
|
||||||
}
|
|
||||||
sFGrid->Barrier();
|
|
||||||
|
|
||||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
|
||||||
double flops=(1344.0*volume)/2;
|
|
||||||
double mf_hi, mf_lo, mf_err;
|
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
|
||||||
mf_hi = flops/timestat.min;
|
|
||||||
mf_lo = flops/timestat.max;
|
|
||||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
|
||||||
|
|
||||||
mflops = flops/timestat.mean;
|
|
||||||
mflops_all.push_back(mflops);
|
|
||||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
|
||||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
|
||||||
if ( mflops>mflops_best ) mflops_best = mflops;
|
|
||||||
if ( mflops<mflops_worst) mflops_worst= mflops;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;
|
|
||||||
|
|
||||||
sDw.Report();
|
|
||||||
|
|
||||||
}
    double robust = mflops_worst/mflops_best;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
    std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
    std::cout<<GridLogMessage <<fmt << std::endl;
    std::cout<<GridLogMessage;
    for(int i=0;i<mflops_all.size();i++){
      std::cout<<mflops_all[i]/NN<<" ; " ;
    }
    std::cout<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  }
  return mflops_best;
}
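The error bar in the mflop/s printout is the standard error of the mean of the per-call timings; this is the same formula implemented by the time_statistics helper (its removal from Benchmark_comms.cc is visible further down in this compare). In the notation used above,

$$ \bar t = \frac{1}{N}\sum_{i=1}^{N} t_i,\qquad
   \sigma_{\bar t} = \sqrt{\frac{\sum_{i=1}^{N}\,(t_i-\bar t)^2}{N(N-1)}},\qquad
   \mathrm{mf\_err} = \frac{\mathrm{flops}}{t_{\min}}\cdot\frac{\sigma_{\bar t}}{\bar t}. $$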

  static double DWF(int Ls,int L, double & robust)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});

    GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
                                                             GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

    std::vector<int> internal;
    if      ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
    else assert(0);

    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
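A minimal sketch of the rank/node bookkeeping above, assuming (as the code does) a power-of-two number of ranks per node up to 8; the 64^4 TmpGrid in the source is only used to query rank and node counts, so the sketch just takes hypothetical NP and NN values.

// Sketch: ranks-per-node -> intranode tiling -> benchmarked global lattice.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main(void) {
  uint64_t NP = 32, NN = 8;                 // hypothetical ranks and nodes
  uint64_t SHM = NP / NN;                   // ranks per node
  std::vector<int> mpi   = {2, 2, 2, 4};    // hypothetical MPI layout
  std::vector<int> local = {24, 24, 24, 24};// hypothetical L=24 local volume

  std::vector<int> internal;                // how the intranode ranks tile the 4-D torus
  if      (SHM == 1) internal = {1, 1, 1, 1};
  else if (SHM == 2) internal = {2, 1, 1, 1};
  else if (SHM == 4) internal = {2, 2, 1, 1};
  else if (SHM == 8) internal = {2, 2, 2, 1};
  else assert(0);

  for (int mu = 0; mu < 4; mu++) {
    int nodes = mpi[mu] / internal[mu];     // inter-node extent in direction mu
    int latt  = local[mu] * nodes;          // the "global volume" quoted by the benchmark
    std::printf("mu=%d nodes=%d latt=%d\n", mu, nodes, latt);
  }
  return 0;
}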

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls            : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode     : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes         : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    ///////// Lattice Init ////////////
    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    ///////// Source preparation ////////////
    LatticeFermion src (FGrid); random(RNG5,src);
    LatticeFermion ref (FGrid);
    LatticeFermion tmp (FGrid);

    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;

    LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);

    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

    ////////////////////////////////////
    // Naive wilson implementation
    ////////////////////////////////////
    {
      LatticeGaugeField Umu5d(FGrid);
      std::vector<LatticeColourMatrix> U(4,FGrid);
      for(int ss=0;ss<Umu._grid->oSites();ss++){
        for(int s=0;s<Ls;s++){
          Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
        }
      }
      ref = zero;
      for(int mu=0;mu<Nd;mu++){
        U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
      }
      for(int mu=0;mu<Nd;mu++){

        tmp = U[mu]*Cshift(src,mu+1,1);
        ref=ref + tmp - Gamma(Gmu[mu])*tmp;

        tmp =adj(U[mu])*src;
        tmp =Cshift(tmp,mu+1,-1);
        ref=ref + tmp + Gamma(Gmu[mu])*tmp;
      }
      ref = -0.5*ref;
    }
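The Cshift loop above builds the reference by applying the Wilson hopping term link by link, with the fifth coordinate a spectator. A sketch of the convention that loop implements (forward hop with 1-gamma, backward hop with 1+gamma, overall factor -1/2):

$$ (D_{\rm hop}\,\psi)(x,s) \;=\; -\tfrac{1}{2}\sum_{\mu=0}^{3}\Big[\,(1-\gamma_\mu)\,U_\mu(x)\,\psi(x+\hat\mu,s)\;+\;(1+\gamma_\mu)\,U_\mu^{\dagger}(x-\hat\mu)\,\psi(x-\hat\mu,s)\Big]. $$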

    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);
    LatticeFermion r_eo  (FGrid);
    LatticeFermion err   (FGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

#if defined(AVX512)
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
      controls Cases [] = {
#ifdef AVX512
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential }
      };
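The fmt string above appears to abbreviate each entry of Cases[] in order: the first letter is the kernel (A = asm, U = hand-unrolled, G = generic) and the second the comms policy (S = sequential, O = overlapped). A small hypothetical helper, with local stand-in enums rather than the real Grid types, that would produce those tags:

// Hypothetical sketch (not in the benchmark) mapping a case to its "A/S"-style tag.
#include <iostream>
#include <string>

enum Opt   { OptInlineAsm, OptHandUnroll, OptGeneric };   // stand-ins for WilsonKernelsStatic options
enum Comms { CommsThenCompute, CommsAndCompute };

std::string CaseTag(Opt o, Comms c) {
  std::string tag = (o == OptInlineAsm) ? "A" : (o == OptHandUnroll) ? "U" : "G";
  tag += (c == CommsThenCompute) ? "/S" : "/O";            // S = sequential, O = overlapped
  return tag;
}

int main(void) {
  std::cout << CaseTag(OptInlineAsm, CommsThenCompute) << std::endl; // prints "A/S"
  return 0;
}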

      for(int c=0;c<num_cases;c++) {

        QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
        QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
        if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
        if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

        int nwarm = 200;
        double t0=usecond();
        FGrid->Barrier();
        for(int i=0;i<nwarm;i++){
          Dw.DhopEO(src_o,r_e,DaggerNo);
        }
        FGrid->Barrier();
        double t1=usecond();
        // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
        // if (ncall < 500) ncall = 500;
        uint64_t ncall = 1000;

        FGrid->Broadcast(0,&ncall,sizeof(ncall));

        // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
        Dw.ZeroCounters();

        time_statistics timestat;
        std::vector<double> t_time(ncall);
        for(uint64_t i=0;i<ncall;i++){
          t0=usecond();
          Dw.DhopEO(src_o,r_e,DaggerNo);
          t1=usecond();
          t_time[i] = t1-t0;
        }
        FGrid->Barrier();

        double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
        double flops=(1344.0*volume)/2;
        double mf_hi, mf_lo, mf_err;

        timestat.statistics(t_time);
        mf_hi = flops/timestat.min;
        mf_lo = flops/timestat.max;
        mf_err= flops/timestat.min * timestat.err/timestat.mean;

        mflops = flops/timestat.mean;
        mflops_all.push_back(mflops);
        if ( mflops_best == 0   ) mflops_best = mflops;
        if ( mflops_worst== 0   ) mflops_worst= mflops;
        if ( mflops>mflops_best ) mflops_best = mflops;
        if ( mflops<mflops_worst) mflops_worst= mflops;

        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;

        Dw.Report();

        Dw.DhopEO(src_o,r_e,DaggerNo);
        Dw.DhopOE(src_e,r_o,DaggerNo);
        setCheckerboard(r_eo,r_o);
        setCheckerboard(r_eo,r_e);
        err = r_eo-ref;
        std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
        assert((norm2(err)<1.0e-4));

      }
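DhopEO above applies only the odd-to-even hopping block, which is why the flop count is halved relative to the full dslash, and the correctness check recombines the two checkerboards and compares against the Cshift reference. In the usual red-black decomposition (a sketch, not Grid-specific notation):

$$ D_{\rm hop} \;=\; \begin{pmatrix} 0 & D_{eo} \\ D_{oe} & 0 \end{pmatrix},\qquad
   \text{DhopEO}:\ \psi_o \mapsto D_{eo}\,\psi_o,\qquad
   \text{flops per call} \;=\; \tfrac{1}{2}\times 1344 \times L_s \times V_4 . $$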
      robust = mflops_worst/mflops_best;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;

      for(int i=0;i<mflops_all.size();i++){
        std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
    return mflops_best;
  }

};

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
#else
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
#endif
  Benchmark::Decomposition();

  int do_memory=1;
  int do_comms =1;
  int do_su3   =0;
  int do_wilson=1;
  int do_dwf   =1;

  if ( do_su3 ) {
    // empty for now
  }
#if 1
  int sel=2;
  std::vector<int> L_list({8,12,16,24});
#else
  int sel=1;
  std::vector<int> L_list({8,12});
#endif
  int selm1=sel-1;
  std::vector<double> robust_list;

  std::vector<double> wilson;
  std::vector<double> dwf4;
  std::vector<double> dwf5;

  if ( do_wilson ) {
    int Ls=1;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      double robust;
      wilson.push_back(Benchmark::DWF(1,L_list[l],robust));
    }
  }

  int Ls=16;
  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      double robust;
      double result = Benchmark::DWF(Ls,L_list[l],robust) ;
      dwf4.push_back(result);
      robust_list.push_back(robust);
    }
  }

  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 5th dimension vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
    }

  }

  if ( do_dwf ) {

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
    for(int l=0;l<L_list.size();l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }

  int NN=NN_global;
  if ( do_memory ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Memory();
  }

  if ( do_comms && (NN>1) ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Comms();
  }

  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 \t\t DWF5 " <<std::endl;
    for(int l=0;l<L_list.size();l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
    std::cout<<std::setprecision(3);
    std::cout<<GridLogMessage << " Comparison point robustness: " << robust_list[sel] <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  }

  Grid_finalize();
}
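A minimal sketch of the comparison-point arithmetic printed above, with hypothetical dwf4 results and node count; sel=2 and selm1=1 pick the L=16 and L=12 entries of L_list, exactly as in main().

// Sketch: average of two mid-size volumes, quoted per node.
#include <cstdio>
#include <vector>

int main(void) {
  std::vector<double> dwf4 = {250000.0, 410000.0, 520000.0, 560000.0}; // hypothetical Mflop/s for L_list={8,12,16,24}
  double NN = 16;                                                      // hypothetical node count
  int sel = 2, selm1 = sel - 1;

  double comparison = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;            // Mflop/s per node
  std::printf("Comparison point %.1f = 0.5*(%.1f+%.1f)\n",
              comparison, dwf4[sel] / NN, dwf4[selm1] / NN);
  return 0;
}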
@@ -31,32 +31,6 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-struct time_statistics{
-  double mean;
-  double err;
-  double min;
-  double max;
-
-  void statistics(std::vector<double> v){
-      double sum = std::accumulate(v.begin(), v.end(), 0.0);
-      mean = sum / v.size();
-
-      std::vector<double> diff(v.size());
-      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
-      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
-      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
-
-      auto result = std::minmax_element(v.begin(), v.end());
-      min = *result.first;
-      max = *result.second;
-  }
-};
-
-void header(){
-  std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
-            <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
-};
-
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -66,21 +40,17 @@ int main (int argc, char ** argv)
   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
-  int Nloop=100;
+  int Nloop=10;
   int nmu=0;
-  int maxlat=32;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 
-  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
-  std::vector<double> t_time(Nloop);
-  time_statistics timestat;
-
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
+  std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-  for(int lat=4;lat<=maxlat;lat+=4){
+  int maxlat=16;
-    for(int Ls=8;Ls<=8;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
 
      std::vector<int> latt_size ({lat*mpi_layout[0],
                                   lat*mpi_layout[1],
@@ -88,23 +58,15 @@ int main (int argc, char ** argv)
                                   lat*mpi_layout[3]});
 
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-    RealD Nrank = Grid._Nprocessors;
-    RealD Nnode = Grid.NodeCount();
-    RealD ppn = Nrank/Nnode;
 
-    std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
-    std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
+    std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+    std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 
    int ncomm;
    int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-    for(int mu=0;mu<8;mu++){
-      xbuf[mu].resize(lat*lat*lat*Ls);
-      rbuf[mu].resize(lat*lat*lat*Ls);
-      // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
-    }
 
-    for(int i=0;i<Nloop;i++){
      double start=usecond();
+    for(int i=0;i<Nloop;i++){
 
      std::vector<CartesianCommunicator::CommsRequest_t> requests;
 
@@ -117,6 +79,7 @@ int main (int argc, char ** argv)
        int comm_proc=1;
        int xmit_to_rank;
        int recv_from_rank;
 
        Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
        Grid.SendToRecvFromBegin(requests,
                                 (void *)&xbuf[mu][0],
@@ -139,24 +102,18 @@ int main (int argc, char ** argv)
      }
      Grid.SendToRecvFromComplete(requests);
      Grid.Barrier();
-      double stop=usecond();
-      t_time[i] = stop-start; // microseconds
    }
+    double stop=usecond();
 
-    timestat.statistics(t_time);
+    double dbytes    = bytes;
+    double xbytes    = Nloop*dbytes*2.0*ncomm;
-    double dbytes    = bytes*ppn;
-    double xbytes    = dbytes*2.0*ncomm;
    double rbytes    = xbytes;
    double bidibytes = xbytes+rbytes;
 
-    std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+    double time = stop-start; // microseconds
-             <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-             <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-             <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-             << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-             << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
+    std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }
 
@@ -164,32 +121,25 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
+  std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 
-  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=8;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
 
     std::vector<int> latt_size ({lat,lat,lat,lat});
 
    GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-    RealD Nrank = Grid._Nprocessors;
-    RealD Nnode = Grid.NodeCount();
-    RealD ppn = Nrank/Nnode;
 
-    std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
-    std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
+    std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+    std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 
-    for(int mu=0;mu<8;mu++){
-      xbuf[mu].resize(lat*lat*lat*Ls);
-      rbuf[mu].resize(lat*lat*lat*Ls);
-      // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
-    }
 
   int ncomm;
  int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
-    for(int i=0;i<Nloop;i++){
    double start=usecond();
+    for(int i=0;i<Nloop;i++){
 
    ncomm=0;
   for(int mu=0;mu<4;mu++){
@@ -228,37 +178,30 @@ int main (int argc, char ** argv)
     }
   }
  Grid.Barrier();
-  double stop=usecond();
-  t_time[i] = stop-start; // microseconds
 
 }
 
-  timestat.statistics(t_time);
+  double stop=usecond();
 
-  double dbytes    = bytes*ppn;
+  double dbytes    = bytes;
-  double xbytes    = dbytes*2.0*ncomm;
+  double xbytes    = Nloop*dbytes*2.0*ncomm;
  double rbytes    = xbytes;
  double bidibytes = xbytes+rbytes;
 
-  std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+  double time = stop-start;
-           <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-           <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-           <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-           << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-           << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
+  std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
  }
 }
 
 
+ Nloop=100;
 std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
 std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-header();
+std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 
-for(int lat=4;lat<=maxlat;lat+=4){
-  for(int Ls=8;Ls<=8;Ls*=2){
+for(int lat=4;lat<=maxlat;lat+=2){
+  for(int Ls=1;Ls<=16;Ls*=2){
 
    std::vector<int> latt_size ({lat*mpi_layout[0],
                                 lat*mpi_layout[1],
@@ -266,9 +209,6 @@ int main (int argc, char ** argv)
                                 lat*mpi_layout[3]});
 
   GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-  RealD Nrank = Grid._Nprocessors;
-  RealD Nnode = Grid.NodeCount();
-  RealD ppn = Nrank/Nnode;
 
  std::vector<HalfSpinColourVectorD *> xbuf(8);
  std::vector<HalfSpinColourVectorD *> rbuf(8);
@@ -276,115 +216,16 @@ int main (int argc, char ** argv)
  for(int d=0;d<8;d++){
    xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
    rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-    bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-    bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
  }
 
 int ncomm;
 int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
-  double dbytes;
+  double start=usecond();
 for(int i=0;i<Nloop;i++){
-    double start=usecond();
 
-    dbytes=0;
-    ncomm=0;
 
   std::vector<CartesianCommunicator::CommsRequest_t> requests;
 
-    for(int mu=0;mu<4;mu++){
-
-
-      if (mpi_layout[mu]>1 ) {
-
-        ncomm++;
-        int comm_proc=1;
-        int xmit_to_rank;
-        int recv_from_rank;
-        Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-        dbytes+=
-          Grid.StencilSendToRecvFromBegin(requests,
-                                          (void *)&xbuf[mu][0],
-                                          xmit_to_rank,
-                                          (void *)&rbuf[mu][0],
-                                          recv_from_rank,
-                                          bytes,mu);
-
-        comm_proc = mpi_layout[mu]-1;
-
-        Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-        dbytes+=
-          Grid.StencilSendToRecvFromBegin(requests,
-                                          (void *)&xbuf[mu+4][0],
-                                          xmit_to_rank,
-                                          (void *)&rbuf[mu+4][0],
-                                          recv_from_rank,
-                                          bytes,mu+4);
-
-      }
-    }
-    Grid.StencilSendToRecvFromComplete(requests,0);
-    Grid.Barrier();
-    double stop=usecond();
-    t_time[i] = stop-start; // microseconds
-
-  }
-
-  timestat.statistics(t_time);
-
-  dbytes=dbytes*ppn;
-  double xbytes    = dbytes*0.5;
-  double rbytes    = dbytes*0.5;
-  double bidibytes = dbytes;
-
-  std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-           <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-           <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-           <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-           << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-           << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-
-
-  }
-}
-
-
-std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
-std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-header();
-
-for(int lat=4;lat<=maxlat;lat+=4){
-  for(int Ls=8;Ls<=8;Ls*=2){
-
-   std::vector<int> latt_size ({lat*mpi_layout[0],
-                                lat*mpi_layout[1],
-                                lat*mpi_layout[2],
-                                lat*mpi_layout[3]});
-
-   GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-   RealD Nrank = Grid._Nprocessors;
-   RealD Nnode = Grid.NodeCount();
-   RealD ppn = Nrank/Nnode;
-
-   std::vector<HalfSpinColourVectorD *> xbuf(8);
-   std::vector<HalfSpinColourVectorD *> rbuf(8);
-   Grid.ShmBufferFreeAll();
-   for(int d=0;d<8;d++){
-     xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-     rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-     bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-     bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-   }
-
-  int ncomm;
-  int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-  double dbytes;
-  for(int i=0;i<Nloop;i++){
-    double start=usecond();
-
-    std::vector<CartesianCommunicator::CommsRequest_t> requests;
-    dbytes=0;
    ncomm=0;
    for(int mu=0;mu<4;mu++){
 
@@ -396,64 +237,52 @@ int main (int argc, char ** argv)
      int recv_from_rank;
 
      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-      dbytes+=
-        Grid.StencilSendToRecvFromBegin(requests,
-                                        (void *)&xbuf[mu][0],
-                                        xmit_to_rank,
-                                        (void *)&rbuf[mu][0],
-                                        recv_from_rank,
-                                        bytes,mu);
-      Grid.StencilSendToRecvFromComplete(requests,mu);
-      requests.resize(0);
+      Grid.StencilSendToRecvFromBegin(requests,
+                                      (void *)&xbuf[mu][0],
+                                      xmit_to_rank,
+                                      (void *)&rbuf[mu][0],
+                                      recv_from_rank,
+                                      bytes);
 
      comm_proc = mpi_layout[mu]-1;
 
      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-      dbytes+=
-        Grid.StencilSendToRecvFromBegin(requests,
-                                        (void *)&xbuf[mu+4][0],
-                                        xmit_to_rank,
-                                        (void *)&rbuf[mu+4][0],
-                                        recv_from_rank,
-                                        bytes,mu+4);
-      Grid.StencilSendToRecvFromComplete(requests,mu+4);
-      requests.resize(0);
+      Grid.StencilSendToRecvFromBegin(requests,
                                      (void *)&xbuf[mu+4][0],
                                      xmit_to_rank,
                                      (void *)&rbuf[mu+4][0],
                                      recv_from_rank,
                                      bytes);
 
    }
   }
+  Grid.StencilSendToRecvFromComplete(requests);
  Grid.Barrier();
-  double stop=usecond();
-  t_time[i] = stop-start; // microseconds
 
 }
+  double stop=usecond();
 
-  timestat.statistics(t_time);
+  double dbytes    = bytes;
+  double xbytes    = Nloop*dbytes*2.0*ncomm;
+  double rbytes    = xbytes;
+  double bidibytes = xbytes+rbytes;
 
-  dbytes=dbytes*ppn;
+  double time = stop-start; // microseconds
-  double xbytes    = dbytes*0.5;
-  double rbytes    = dbytes*0.5;
-  double bidibytes = dbytes;
 
+  std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
-  std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-           <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-           <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-           <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-           << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-           << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
  }
 }
 
 
+ Nloop=100;
 std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
 std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-header();
+std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 
-for(int lat=4;lat<=maxlat;lat+=4){
-  for(int Ls=8;Ls<=8;Ls*=2){
+for(int lat=4;lat<=maxlat;lat+=2){
+  for(int Ls=1;Ls<=16;Ls*=2){
 
   std::vector<int> latt_size ({lat*mpi_layout[0],
                                lat*mpi_layout[1],
@@ -461,9 +290,6 @@ int main (int argc, char ** argv)
                               lat*mpi_layout[3]});
 
  GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-  RealD Nrank = Grid._Nprocessors;
-  RealD Nnode = Grid.NodeCount();
-  RealD ppn = Nrank/Nnode;
 
  std::vector<HalfSpinColourVectorD *> xbuf(8);
  std::vector<HalfSpinColourVectorD *> rbuf(8);
@@ -471,71 +297,65 @@ int main (int argc, char ** argv)
  for(int d=0;d<8;d++){
    xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
    rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-    bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-    bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
  }
 
 int ncomm;
 int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-  double dbytes;
+  double start=usecond();
 for(int i=0;i<Nloop;i++){
-    double start=usecond();
 
   std::vector<CartesianCommunicator::CommsRequest_t> requests;
-    dbytes=0;
   ncomm=0;
+    for(int mu=0;mu<4;mu++){
-    parallel_for(int dir=0;dir<8;dir++){
 
-      double tbytes;
-      int mu =dir % 4;
 
      if (mpi_layout[mu]>1 ) {
 
        ncomm++;
+        int comm_proc=1;
        int xmit_to_rank;
        int recv_from_rank;
-        if ( dir == mu ) {
-          int comm_proc=1;
+        Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+        Grid.StencilSendToRecvFromBegin(requests,
-        } else {
+                                        (void *)&xbuf[mu][0],
-          int comm_proc = mpi_layout[mu]-1;
+                                        xmit_to_rank,
-          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+                                        (void *)&rbuf[mu][0],
-        }
+                                        recv_from_rank,
+                                        bytes);
+        // Grid.StencilSendToRecvFromComplete(requests);
+        // requests.resize(0);
 
-        tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+        comm_proc = mpi_layout[mu]-1;
-                                           (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
 
+        Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-#pragma omp atomic
+        Grid.StencilSendToRecvFromBegin(requests,
-        dbytes+=tbytes;
+                                        (void *)&xbuf[mu+4][0],
+                                        xmit_to_rank,
+                                        (void *)&rbuf[mu+4][0],
+                                        recv_from_rank,
+                                        bytes);
+        Grid.StencilSendToRecvFromComplete(requests);
+        requests.resize(0);
 
      }
    }
    Grid.Barrier();
-    double stop=usecond();
-    t_time[i] = stop-start; // microseconds
  }
+  double stop=usecond();
 
-  timestat.statistics(t_time);
+  double dbytes    = bytes;
+  double xbytes    = Nloop*dbytes*2.0*ncomm;
+  double rbytes    = xbytes;
+  double bidibytes = xbytes+rbytes;
 
-  dbytes=dbytes*ppn;
+  double time = stop-start; // microseconds
-  double xbytes    = dbytes*0.5;
-  double rbytes    = dbytes*0.5;
-  double bidibytes = dbytes;
 
+  std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
-  std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-           <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-           <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-           <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-           << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-           << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
  }
 }
 
-std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
-std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 
 Grid_finalize();
 }
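Both sides of the halo-exchange diff above reduce to the same bookkeeping: each active direction moves a surface of lat^3 x Ls half spinors, once in each sense, and the quoted MB/s figure is bytes moved divided by the measured microseconds. A minimal sketch under those assumptions, taking sizeof(HalfSpinColourVectorD) as 2 spin x 3 colour complex doubles = 96 bytes and a hypothetical single timing:

// Sketch: surface bytes and measured time -> MB/s uni / bidi, as tabulated above.
#include <cstdio>

int main(void) {
  int    lat = 16, Ls = 8, ncomm = 4;                  // hypothetical surface and active directions
  double bytes = double(lat) * lat * lat * Ls * 96.0;  // 96 bytes per double-precision half spinor
  double time_us = 1800.0;                             // hypothetical measured exchange time (microseconds)

  double xbytes    = bytes * 2.0 * ncomm;              // send volume: both senses of every active direction
  double bidibytes = 2.0 * xbytes;                     // send plus receive
  // bytes per microsecond is numerically MB/s, the unit used in the tables
  std::printf("uni %.1f MB/s  bidi %.1f MB/s\n", xbytes / time_us, bidibytes / time_us);
  return 0;
}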
@ -1,22 +1,28 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
Source file: ./benchmarks/Benchmark_dwf.cc
|
Source file: ./benchmarks/Benchmark_dwf.cc
|
||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
(at your option) any later version.
|
(at your option) any later version.
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
This program is distributed in the hope that it will be useful,
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
GNU General Public License for more details.
|
GNU General Public License for more details.
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
You should have received a copy of the GNU General Public License along
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
@ -42,22 +48,16 @@ typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
|
|||||||
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
|
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
|
||||||
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
|
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
|
||||||
|
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
int main (int argc, char ** argv)
|
||||||
{
|
{
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
int threads = GridThread::GetThreads();
|
||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
std::vector<int> latt4 = GridDefaultLatt();
|
std::vector<int> latt4 = GridDefaultLatt();
|
||||||
int Ls=16;
|
const int Ls=8;
|
||||||
for(int i=0;i<argc;i++)
|
|
||||||
if(std::string(argv[i]) == "-Ls"){
|
|
||||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
@ -71,66 +71,35 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
|
||||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
|
||||||
|
|
||||||
LatticeFermion src (FGrid); random(RNG5,src);
|
LatticeFermion src (FGrid); random(RNG5,src);
|
||||||
#if 0
|
|
||||||
src = zero;
|
|
||||||
{
|
|
||||||
std::vector<int> origin({0,0,0,latt4[2]-1,0});
|
|
||||||
SpinColourVectorF tmp;
|
|
||||||
tmp=zero;
|
|
||||||
tmp()(0)(0)=Complex(-2.0,0.0);
|
|
||||||
std::cout << " source site 0 " << tmp<<std::endl;
|
|
||||||
pokeSite(tmp,src,origin);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
RealD N2 = 1.0/::sqrt(norm2(src));
|
|
||||||
src = src*N2;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
LatticeFermion result(FGrid); result=zero;
|
LatticeFermion result(FGrid); result=zero;
|
||||||
LatticeFermion ref(FGrid); ref=zero;
|
LatticeFermion ref(FGrid); ref=zero;
|
||||||
LatticeFermion tmp(FGrid);
|
LatticeFermion tmp(FGrid);
|
||||||
LatticeFermion err(FGrid);
|
LatticeFermion err(FGrid);
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
|
||||||
LatticeGaugeField Umu(UGrid);
|
LatticeGaugeField Umu(UGrid);
|
||||||
SU3::HotConfiguration(RNG4,Umu);
|
random(RNG4,Umu);
|
||||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
|
||||||
#if 0
|
|
||||||
Umu=1.0;
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
LatticeColourMatrix ttmp(UGrid);
|
|
||||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
|
||||||
// if (mu !=2 ) ttmp = 0;
|
|
||||||
// ttmp = ttmp* pow(10.0,mu);
|
|
||||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
////////////////////////////////////
|
|
||||||
// Naive wilson implementation
|
|
||||||
////////////////////////////////////
|
|
||||||
// replicate across fifth dimension
|
|
||||||
LatticeGaugeField Umu5d(FGrid);
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
|
||||||
|
// replicate across fifth dimension
|
||||||
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////
|
||||||
|
// Naive wilson implementation
|
||||||
|
////////////////////////////////////
|
||||||
|
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||||
for(int mu=0;mu<Nd;mu++){
|
for(int mu=0;mu<Nd;mu++){
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
||||||
}
|
}
|
||||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
|
||||||
|
|
||||||
if (1)
|
if (1)
|
||||||
{
|
{
|
||||||
@ -151,7 +120,8 @@ int main (int argc, char ** argv)
|
|||||||
RealD M5 =1.8;
|
RealD M5 =1.8;
|
||||||
|
|
||||||
RealD NP = UGrid->_Nprocessors;
|
RealD NP = UGrid->_Nprocessors;
|
||||||
- RealD NN = UGrid->NodeCount();
+ DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
@@ -161,22 +131,15 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
- #ifdef GRID_OMP
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
- #endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

- DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
- int ncall =500;
+ int ncall =100;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
- Dw.Dhop(src,result,0);
- std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
@@ -190,55 +153,16 @@ int main (int argc, char ** argv)
double flops=1344*volume*ncall;

std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
- // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+ std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
- // std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
+ std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;

- /*
- if(( norm2(err)>1.0e-4) ) {
- std::cout << "RESULT\n " << result<<std::endl;
- std::cout << "REF \n " << ref <<std::endl;
- std::cout << "ERR \n " << err <<std::endl;
- FGrid->Barrier();
- exit(-1);
- }
- */
assert (norm2(err)< 1.0e-4 );
Dw.Report();
}

- DomainWallFermionRL DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
- if (1) {
- FGrid->Barrier();
- DwH.ZeroCounters();
- DwH.Dhop(src,result,0);
- double t0=usecond();
- for(int i=0;i<ncall;i++){
- __SSC_START;
- DwH.Dhop(src,result,0);
- __SSC_STOP;
- }
- double t1=usecond();
- FGrid->Barrier();

- double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
- double flops=1344*volume*ncall;

- std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
- std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
- err = ref-result;
- std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;

- assert (norm2(err)< 1.0e-3 );
- DwH.Report();
- }

if (1)
{
@@ -247,10 +171,6 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
- #ifdef GRID_OMP
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
- #endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
@@ -262,13 +182,21 @@ int main (int argc, char ** argv)
LatticeFermion sresult(sFGrid);

WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);

- localConvert(src,ssrc);
+ for(int x=0;x<latt4[0];x++){
+ for(int y=0;y<latt4[1];y++){
+ for(int z=0;z<latt4[2];z++){
+ for(int t=0;t<latt4[3];t++){
+ for(int s=0;s<Ls;s++){
+ std::vector<int> site({s,x,y,z,t});
+ SpinColourVector tmp;
+ peekSite(tmp,src,site);
+ pokeSite(tmp,ssrc,site);
+ }}}}}
std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
FGrid->Barrier();
- sDw.Dhop(ssrc,sresult,0);
- sDw.ZeroCounters();
double t0=usecond();
+ sDw.ZeroCounters();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.Dhop(ssrc,sresult,0);
@@ -282,53 +210,46 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
- // std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
sDw.Report();

+ if(0){
+ for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+ sDw.Dhop(ssrc,sresult,0);
+ PerformanceCounter Counter(i);
+ Counter.Start();
+ sDw.Dhop(ssrc,sresult,0);
+ Counter.Stop();
+ Counter.Report();
+ }
+ }

+ std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;

RealD sum=0;
+ for(int x=0;x<latt4[0];x++){
+ for(int y=0;y<latt4[1];y++){
+ for(int z=0;z<latt4[2];z++){
+ for(int t=0;t<latt4[3];t++){
+ for(int s=0;s<Ls;s++){
+ std::vector<int> site({s,x,y,z,t});
+ SpinColourVector normal, simd;
+ peekSite(normal,result,site);
+ peekSite(simd,sresult,site);
+ sum=sum+norm2(normal-simd);
+ if (norm2(normal-simd) > 1.0e-6 ) {
+ std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+ std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
+ std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd "<<simd<<std::endl;
+ }
+ }}}}}
+ std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
+ assert (sum< 1.0e-4 );

- err=zero;
- localConvert(sresult,err);
- err = err - ref;
- sum = norm2(err);
- std::cout<<GridLogMessage<<" difference between normal ref and simd is "<<sum<<std::endl;
- if(sum > 1.0e-4 ){
- std::cout<< "sD REF\n " <<ref << std::endl;
- std::cout<< "sD ERR \n " <<err <<std::endl;
- }
- // assert(sum < 1.0e-4);

- err=zero;
+ if (1) {
- localConvert(sresult,err);
- err = err - result;
- sum = norm2(err);
- std::cout<<GridLogMessage<<" difference between normal result and simd is "<<sum<<std::endl;
- if(sum > 1.0e-4 ){
- std::cout<< "sD REF\n " <<result << std::endl;
- std::cout<< "sD ERR \n " << err <<std::endl;
- }
- assert(sum < 1.0e-4);

- if(1){
- std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
- std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
- std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
- if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
- if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
- #ifdef GRID_OMP
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
- #endif
- if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric )
- std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
- if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
- std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
- if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm )
- std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
- std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

LatticeFermion sr_eo(sFGrid);

LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid);
LatticeFermion sr_e (sFrbGrid);
@@ -336,30 +257,39 @@ int main (int argc, char ** argv)

pickCheckerboard(Even,ssrc_e,ssrc);
pickCheckerboard(Odd,ssrc_o,ssrc);
- // setCheckerboard(sr_eo,ssrc_o);
- // setCheckerboard(sr_eo,ssrc_e);
+ setCheckerboard(sr_eo,ssrc_o);
+ setCheckerboard(sr_eo,ssrc_e);

sr_e = zero;
sr_o = zero;

+ std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+ std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
+ std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
+ if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+ if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+ if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+ if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+ if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+ std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

FGrid->Barrier();
- sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
sDw.ZeroCounters();
- // sDw.stat.init("DhopEO");
+ sDw.stat.init("DhopEO");
double t0=usecond();
for (int i = 0; i < ncall; i++) {
sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
- // sDw.stat.print();
+ sDw.stat.print();

double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;

std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
- std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
sDw.Report();

sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@@ -368,43 +298,24 @@ int main (int argc, char ** argv)

pickCheckerboard(Even,ssrc_e,sresult);
pickCheckerboard(Odd ,ssrc_o,sresult);

ssrc_e = ssrc_e - sr_e;
RealD error = norm2(ssrc_e);
- std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl;

+ std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl;
ssrc_o = ssrc_o - sr_o;

error+= norm2(ssrc_o);
std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl;
- if(( error>1.0e-4) ) {
+ if(error>1.0e-4) {
setCheckerboard(ssrc,ssrc_o);
setCheckerboard(ssrc,ssrc_e);
- std::cout<< "DIFF\n " <<ssrc << std::endl;
+ std::cout<< ssrc << std::endl;
- setCheckerboard(ssrc,sr_o);
- setCheckerboard(ssrc,sr_e);
- std::cout<< "CBRESULT\n " <<ssrc << std::endl;
- std::cout<< "RESULT\n " <<sresult<< std::endl;
}
- assert(error<1.0e-4);
}

- if(0){
- std::cout << "Single cache warm call to sDw.Dhop " <<std::endl;
- for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
- sDw.Dhop(ssrc,sresult,0);
- PerformanceCounter Counter(i);
- Counter.Start();
- sDw.Dhop(ssrc,sresult,0);
- Counter.Stop();
- Counter.Report();
- }
- }

}

if (1)
{ // Naive wilson dag implementation
ref = zero;
@@ -413,30 +324,25 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
}

tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
}
}
ref = -0.5*ref;
}
- // dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
- std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
+ std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
- std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
+ std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
err = ref-result;
- std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
+ std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
- if((norm2(err)>1.0e-4)){
+ assert(norm2(err)<1.0e-4);
- std::cout<< "DAG RESULT\n " <<ref << std::endl;
- std::cout<< "DAG sRESULT\n " <<result << std::endl;
- std::cout<< "DAG ERR \n " << err <<std::endl;
- }
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
@@ -444,24 +350,18 @@ int main (int argc, char ** argv)
LatticeFermion r_eo (FGrid);

- std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
+ std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);

std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;

- // S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
- #ifdef GRID_OMP
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
- #endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
@@ -469,7 +369,6 @@ int main (int argc, char ** argv)
{
Dw.ZeroCounters();
FGrid->Barrier();
- Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
@@ -482,7 +381,6 @@ int main (int argc, char ** argv)

std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
- std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
@@ -498,20 +396,14 @@ int main (int argc, char ** argv)

err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
- if((norm2(err)>1.0e-4)){
+ assert(norm2(err)<1.0e-4);
- std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
- std::cout<< "Deo REF\n " <<result << std::endl;
- std::cout<< "Deo ERR \n " << err <<std::endl;
- }

pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;

assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
- Grid_finalize();
- exit(0);
- }

+ Grid_finalize();
+ }
@@ -1,190 +0,0 @@
- #include <Grid/Grid.h>
- #include <sstream>
- using namespace std;
- using namespace Grid;
- using namespace Grid::QCD;

- template<class d>
- struct scal {
- d internal;
- };

- Gamma::Algebra Gmu [] = {
- Gamma::Algebra::GammaX,
- Gamma::Algebra::GammaY,
- Gamma::Algebra::GammaZ,
- Gamma::Algebra::GammaT
- };

- typedef typename GparityDomainWallFermionF::FermionField GparityLatticeFermionF;
- typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;

- int main (int argc, char ** argv)
- {
- Grid_init(&argc,&argv);

- int Ls=16;
- for(int i=0;i<argc;i++)
- if(std::string(argv[i]) == "-Ls"){
- std::stringstream ss(argv[i+1]); ss >> Ls;
- }

- int threads = GridThread::GetThreads();
- std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
- std::cout<<GridLogMessage << "Ls = " << Ls << std::endl;

- std::vector<int> latt4 = GridDefaultLatt();

- GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
- GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
- GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
- GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

- std::vector<int> seeds4({1,2,3,4});
- std::vector<int> seeds5({5,6,7,8});

- std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
- GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
- std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
- GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
- std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

- GparityLatticeFermionF src (FGrid); random(RNG5,src);
- RealD N2 = 1.0/::sqrt(norm2(src));
- src = src*N2;

- GparityLatticeFermionF result(FGrid); result=zero;
- GparityLatticeFermionF ref(FGrid); ref=zero;
- GparityLatticeFermionF tmp(FGrid);
- GparityLatticeFermionF err(FGrid);

- std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
- LatticeGaugeFieldF Umu(UGrid);
- SU3::HotConfiguration(RNG4,Umu);
- std::cout << GridLogMessage << "Random gauge initialised " << std::endl;

- RealD mass=0.1;
- RealD M5 =1.8;

- RealD NP = UGrid->_Nprocessors;
- RealD NN = UGrid->NodeCount();

- std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
- std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
- std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
- std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
- std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::Dhop "<<std::endl;
- std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
- #ifdef GRID_OMP
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
- if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
- #endif
- if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
- if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
- if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
- std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

- std::cout << GridLogMessage<< "* SINGLE/SINGLE"<<std::endl;
- GparityDomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
- int ncall =1000;
- if (1) {
- FGrid->Barrier();
- Dw.ZeroCounters();
- Dw.Dhop(src,result,0);
- std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
- double t0=usecond();
- for(int i=0;i<ncall;i++){
- __SSC_START;
- Dw.Dhop(src,result,0);
- __SSC_STOP;
- }
- double t1=usecond();
- FGrid->Barrier();

- double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
- double flops=2*1344*volume*ncall;

- std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
- // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
- // std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
- std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
- Dw.Report();
- }

- std::cout << GridLogMessage<< "* SINGLE/HALF"<<std::endl;
- GparityDomainWallFermionFH DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
- if (1) {
- FGrid->Barrier();
- DwH.ZeroCounters();
- DwH.Dhop(src,result,0);
- double t0=usecond();
- for(int i=0;i<ncall;i++){
- __SSC_START;
- DwH.Dhop(src,result,0);
- __SSC_STOP;
- }
- double t1=usecond();
- FGrid->Barrier();

- double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
- double flops=2*1344*volume*ncall;

- std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
- std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
- DwH.Report();
- }

- GridCartesian * UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
- GridRedBlackCartesian * UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d);
- GridCartesian * FGrid_d = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_d);
- GridRedBlackCartesian * FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_d);

- std::cout << GridLogMessage<< "* DOUBLE/DOUBLE"<<std::endl;
- GparityLatticeFermionD src_d(FGrid_d);
- precisionChange(src_d,src);

- LatticeGaugeFieldD Umu_d(UGrid_d);
- precisionChange(Umu_d,Umu);

- GparityLatticeFermionD result_d(FGrid_d);

- GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
- if (1) {
- FGrid_d->Barrier();
- DwD.ZeroCounters();
- DwD.Dhop(src_d,result_d,0);
- std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
- double t0=usecond();
- for(int i=0;i<ncall;i++){
- __SSC_START;
- DwD.Dhop(src_d,result_d,0);
- __SSC_STOP;
- }
- double t1=usecond();
- FGrid_d->Barrier();

- double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
- double flops=2*1344*volume*ncall;

- std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
- // std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
- // std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
- std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
- std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
- DwD.Report();
- }

- Grid_finalize();
- }
@@ -66,8 +66,7 @@ int main (int argc, char ** argv)

Vec tsum; tsum = zero;

- GridParallelRNG pRNG(&Grid);
+ GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
- pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));

std::vector<double> stop(threads);
Vector<Vec> sum(threads);
@@ -78,7 +77,8 @@ int main (int argc, char ** argv)
}

double start=usecond();
- parallel_for(int t=0;t<threads;t++){
+ PARALLEL_FOR_LOOP
+ for(int t=0;t<threads;t++){

sum[t] = x[t]._odata[0];
for(int i=0;i<Nloop;i++){

@@ -55,21 +55,21 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
- uint64_t lmax=96;
+ uint64_t lmax=44;
- #define NLOOP (10*lmax*lmax*lmax*lmax/vol)
+ #define NLOOP (1*lmax*lmax*lmax*lmax/vol)
- for(int lat=8;lat<=lmax;lat+=8){
+ for(int lat=4;lat<=lmax;lat+=4){

std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);

uint64_t Nloop=NLOOP;

- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();

- LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid);// random(pRNG,y);
+ LatticeVec y(&Grid); //random(pRNG,y);
double a=2.0;

@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
double time = (stop-start)/Nloop*1000;

double flops=vol*Nvec*2;// mul,add
- double bytes=3.0*vol*Nvec*sizeof(Real);
+ double bytes=3*vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

}
@@ -94,17 +94,17 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

- for(int lat=8;lat<=lmax;lat+=8){
+ for(int lat=4;lat<=lmax;lat+=4){

std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);

- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();

- LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid);// random(pRNG,y);
+ LatticeVec y(&Grid); //random(pRNG,y);
double a=2.0;

uint64_t Nloop=NLOOP;
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
double time = (stop-start)/Nloop*1000;

double flops=vol*Nvec*2;// mul,add
- double bytes=3.0*vol*Nvec*sizeof(Real);
+ double bytes=3*vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

}
@@ -129,20 +129,20 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;

- for(int lat=8;lat<=lmax;lat+=8){
+ for(int lat=4;lat<=lmax;lat+=4){

std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;

GridCartesian Grid(latt_size,simd_layout,mpi_layout);

- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();

- LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid);// random(pRNG,y);
+ LatticeVec y(&Grid); //random(pRNG,y);
RealD a=2.0;

@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
double stop=usecond();
double time = (stop-start)/Nloop*1000;

- double bytes=2.0*vol*Nvec*sizeof(Real);
+ double bytes=2*vol*Nvec*sizeof(Real);
double flops=vol*Nvec*1;// mul
std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

@@ -166,17 +166,17 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

- for(int lat=8;lat<=lmax;lat+=8){
+ for(int lat=4;lat<=lmax;lat+=4){

std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);

- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
- LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid);// random(pRNG,y);
+ LatticeVec y(&Grid); //random(pRNG,y);
RealD a=2.0;
Real nn;
double start=usecond();
@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
double stop=usecond();
double time = (stop-start)/Nloop*1000;

- double bytes=1.0*vol*Nvec*sizeof(Real);
+ double bytes=vol*Nvec*sizeof(Real);
double flops=vol*Nvec*2;// mul,add
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;

@@ -1,134 +0,0 @@
- /*************************************************************************************

- Grid physics library, www.github.com/paboyle/Grid

- Source file: ./benchmarks/Benchmark_staggered.cc

- Copyright (C) 2015

- Author: Peter Boyle <paboyle@ph.ed.ac.uk>
- Author: paboyle <paboyle@ph.ed.ac.uk>

- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.

- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.

- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

- See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
- #include <Grid/Grid.h>

- using namespace std;
- using namespace Grid;
- using namespace Grid::QCD;

- int main (int argc, char ** argv)
- {
- Grid_init(&argc,&argv);

- std::vector<int> latt_size = GridDefaultLatt();
- std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
- std::vector<int> mpi_layout = GridDefaultMpi();
- GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- GridRedBlackCartesian RBGrid(&Grid);

- int threads = GridThread::GetThreads();
- std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
- std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
- std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
- std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;

- std::vector<int> seeds({1,2,3,4});
- GridParallelRNG pRNG(&Grid);
- pRNG.SeedFixedIntegers(seeds);
- // pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});

- typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
- typename ImprovedStaggeredFermionR::ImplParams params;

- FermionField src (&Grid); random(pRNG,src);
- FermionField result(&Grid); result=zero;
- FermionField ref(&Grid); ref=zero;
- FermionField tmp(&Grid); tmp=zero;
- FermionField err(&Grid); tmp=zero;
- LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
- std::vector<LatticeColourMatrix> U(4,&Grid);

- double volume=1;
- for(int mu=0;mu<Nd;mu++){
- volume=volume*latt_size[mu];
- }

- // Only one non-zero (y)
- #if 0
- Umu=zero;
- Complex cone(1.0,0.0);
- for(int nn=0;nn<Nd;nn++){
- random(pRNG,U[nn]);
- if(1) {
- if (nn!=2) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
- // else { U[nn]= cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
- else { std::cout<<GridLogMessage << "random gauge field in dir "<<nn<<std::endl; }
- }
- PokeIndex<LorentzIndex>(Umu,U[nn],nn);
- }
- #endif

- for(int mu=0;mu<Nd;mu++){
- U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
- }
- ref = zero;
- /*
- { // Naive wilson implementation
- ref = zero;
- for(int mu=0;mu<Nd;mu++){
- // ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
- tmp = U[mu]*Cshift(src,mu,1);
- for(int i=0;i<ref._odata.size();i++){
- ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
- }

- tmp =adj(U[mu])*src;
- tmp =Cshift(tmp,mu,-1);
- for(int i=0;i<ref._odata.size();i++){
- ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
- }
- }
- }
- ref = -0.5*ref;
- */

- RealD mass=0.1;
- RealD c1=9.0/8.0;
- RealD c2=-1.0/24.0;
- RealD u0=1.0;
- ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);

- std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
- int ncall=1000;
- double t0=usecond();
- for(int i=0;i<ncall;i++){
- Ds.Dhop(src,result,0);
- }
- double t1=usecond();
- double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146

- std::cout<<GridLogMessage << "Called Ds"<<std::endl;
- std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
- std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
- std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
- err = ref-result;
- std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;

- Grid_finalize();
- }
@ -35,14 +35,13 @@ using namespace Grid::QCD;
|
|||||||
int main (int argc, char ** argv)
|
int main (int argc, char ** argv)
|
||||||
{
|
{
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
#define LMAX (64)
|
|
||||||
|
|
||||||
int64_t Nloop=20;
|
int Nloop=1000;
|
||||||
|
|
||||||
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
|
||||||
int64_t threads = GridThread::GetThreads();
|
int threads = GridThread::GetThreads();
|
||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
@ -51,19 +50,19 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||||
|
|
||||||
for(int lat=2;lat<=LMAX;lat+=2){
|
for(int lat=2;lat<=32;lat+=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
|
||||||
|
|
||||||
LatticeColourMatrix z(&Grid); random(pRNG,z);
|
LatticeColourMatrix z(&Grid);// random(pRNG,z);
|
||||||
LatticeColourMatrix x(&Grid); random(pRNG,x);
|
LatticeColourMatrix x(&Grid);// random(pRNG,x);
|
||||||
LatticeColourMatrix y(&Grid); random(pRNG,y);
|
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
||||||
|
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
for(int64_t i=0;i<Nloop;i++){
|
for(int i=0;i<Nloop;i++){
|
||||||
x=x*y;
|
x=x*y;
|
||||||
}
|
}
|
||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
@ -83,20 +82,20 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||||
|
|
||||||
for(int lat=2;lat<=LMAX;lat+=2){
|
for(int lat=2;lat<=32;lat+=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
|
|
     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-    GridParallelRNG   pRNG(&Grid);  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    //  GridParallelRNG   pRNG(&Grid);  pRNG.SeedRandomDevice();

-    LatticeColourMatrix z(&Grid); random(pRNG,z);
+    LatticeColourMatrix z(&Grid); //random(pRNG,z);
-    LatticeColourMatrix x(&Grid); random(pRNG,x);
+    LatticeColourMatrix x(&Grid); //random(pRNG,x);
-    LatticeColourMatrix y(&Grid); random(pRNG,y);
+    LatticeColourMatrix y(&Grid); //random(pRNG,y);

     double start=usecond();
-    for(int64_t i=0;i<Nloop;i++){
+    for(int i=0;i<Nloop;i++){
       z=x*y;
     }
     double stop=usecond();
@@ -114,20 +113,20 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=LMAX;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];

     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-    GridParallelRNG   pRNG(&Grid);  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    //  GridParallelRNG   pRNG(&Grid);  pRNG.SeedRandomDevice();

-    LatticeColourMatrix z(&Grid); random(pRNG,z);
+    LatticeColourMatrix z(&Grid); //random(pRNG,z);
-    LatticeColourMatrix x(&Grid); random(pRNG,x);
+    LatticeColourMatrix x(&Grid); //random(pRNG,x);
-    LatticeColourMatrix y(&Grid); random(pRNG,y);
+    LatticeColourMatrix y(&Grid); //random(pRNG,y);

     double start=usecond();
-    for(int64_t i=0;i<Nloop;i++){
+    for(int i=0;i<Nloop;i++){
       mult(z,x,y);
     }
     double stop=usecond();
@@ -145,20 +144,20 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=LMAX;lat+=2){
+  for(int lat=2;lat<=32;lat+=2){

     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];

     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-    GridParallelRNG   pRNG(&Grid);  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    //  GridParallelRNG   pRNG(&Grid);  pRNG.SeedRandomDevice();

-    LatticeColourMatrix z(&Grid); random(pRNG,z);
+    LatticeColourMatrix z(&Grid); //random(pRNG,z);
-    LatticeColourMatrix x(&Grid); random(pRNG,x);
+    LatticeColourMatrix x(&Grid); //random(pRNG,x);
-    LatticeColourMatrix y(&Grid); random(pRNG,y);
+    LatticeColourMatrix y(&Grid); //random(pRNG,y);

     double start=usecond();
-    for(int64_t i=0;i<Nloop;i++){
+    for(int i=0;i<Nloop;i++){
       mac(z,x,y);
     }
     double stop=usecond();
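The hunks above all follow the same micro-benchmark pattern: initialise the fields once, wrap `Nloop` applications of a single kernel (`z=x*y`, `mult`, `mac`) in a pair of `usecond()` calls, and derive GB/s and GFlop/s from the bytes and flops moved per iteration. The one-line type changes (`int64_t` loop counter and volume) matter because the byte count grows past the 32-bit range for large lattices. A minimal sketch of the same pattern in plain standard C++ (no Grid dependency; `Nloop`, the volume and the per-site payload are illustrative stand-ins, not the benchmark's actual values):

```cpp
// Minimal sketch of the timing pattern used by these benchmarks, in standard C++.
// The volume, Nloop and the double payload stand in for Grid's lattice objects.
#include <chrono>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    const int64_t Nloop = 1000;
    const int64_t vol   = 32LL * 32 * 32 * 32;          // sites, kept 64-bit as in the patched loop
    std::vector<double> x(vol, 1.0), y(vol, 2.0), z(vol);

    auto start = std::chrono::high_resolution_clock::now();
    for (int64_t i = 0; i < Nloop; ++i)
    {
        for (int64_t s = 0; s < vol; ++s) z[s] = x[s] * y[s];   // stands in for z = x*y
    }
    auto stop = std::chrono::high_resolution_clock::now();

    double  usecs = std::chrono::duration<double, std::micro>(stop - start).count();
    int64_t bytes = 3 * static_cast<int64_t>(sizeof(double)) * vol * Nloop; // overflows a 32-bit int
    std::cout << "GB/s ~ " << bytes / usecs / 1.0e3 << std::endl;
    return 0;
}
```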
@@ -58,7 +58,7 @@ int main (int argc, char ** argv)
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
   GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
-  GridRedBlackCartesian RBGrid(&Grid);
+  GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);

   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
   GridParallelRNG pRNG(&Grid);
   pRNG.SeedFixedIntegers(seeds);
-  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+  //  pRNG.SeedRandomDevice();

   LatticeFermion src   (&Grid); random(pRNG,src);
   LatticeFermion result(&Grid); result=zero;
@@ -93,7 +93,7 @@ int main (int argc, char ** argv)
     std::cout << latt_size.back() << "\t\t";

     GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
-    GridRedBlackCartesian RBGrid(&Grid);
+    GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);

     GridParallelRNG pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
     LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
@@ -1,7 +1,11 @@
 include Make.inc

-bench-local: all
-	./Benchmark_su3
-	./Benchmark_memory_bandwidth
-	./Benchmark_wilson
-	./Benchmark_dwf --dslash-unroll
+simple: simple_su3_test.o simple_su3_expr.o simple_simd_test.o
+
+EXTRA_LIBRARIES = libsimple_su3_test.a libsimple_su3_expr.a libsimple_simd_test.a
+
+libsimple_su3_test_a_SOURCES = simple_su3_test.cc
+
+libsimple_su3_expr_a_SOURCES = simple_su3_expr.cc
+
+libsimple_simd_test_a_SOURCES = simple_simd_test.cc
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash

-EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2'
+EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'

 echo "-- deploying Eigen source..."
 wget ${EIGEN_URL} --no-check-certificate
241 configure.ac
@@ -1,23 +1,16 @@
 AC_PREREQ([2.63])
-AC_INIT([Grid], [0.7.0], [https://github.com/paboyle/Grid], [Grid])
+AC_INIT([Grid], [0.6.0], [https://github.com/paboyle/Grid], [Grid])
 AC_CANONICAL_BUILD
 AC_CANONICAL_HOST
 AC_CANONICAL_TARGET
-AM_INIT_AUTOMAKE([subdir-objects 1.13])
-AM_EXTRA_RECURSIVE_TARGETS([tests bench])
+AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([lib/Grid.h])
 AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])

-################ Get git info
-#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
-
-################ Set flags
-# do not move!
-CXXFLAGS="-O3 $CXXFLAGS"
-
 ############### Checks for programs
+CXXFLAGS="-O3 $CXXFLAGS"
 AC_PROG_CXX
 AC_PROG_RANLIB

@@ -31,14 +24,12 @@ AX_GXX_VERSION
 AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
             [version of g++ that will compile the code])
-
-

 ############### Checks for typedefs, structures, and compiler characteristics
 AC_TYPE_SIZE_T
 AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T

 ############### OpenMP
 AC_OPENMP
 ac_openmp=no
 if test "${OPENMP_CXXFLAGS}X" != "X"; then
@@ -54,14 +45,9 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
-AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

-############## Standard libraries
-AC_CHECK_LIB([m],[cos])
-AC_CHECK_LIB([stdc++],[abort])
-
 ############### GMP and MPFR
 AC_ARG_WITH([gmp],
     [AS_HELP_STRING([--with-gmp=prefix],
@@ -74,23 +60,16 @@ AC_ARG_WITH([mpfr],
     [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
     [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])

 ############### FFTW3
 AC_ARG_WITH([fftw],
     [AS_HELP_STRING([--with-fftw=prefix],
    [try this for a non-standard install prefix of the FFTW3 library])],
     [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
     [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])

-############### LIME
-AC_ARG_WITH([lime],
-    [AS_HELP_STRING([--with-lime=prefix],
-    [try this for a non-standard install prefix of the LIME library])],
-    [AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
-    [AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
-
 ############### lapack
 AC_ARG_ENABLE([lapack],
     [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
     [ac_LAPACK=${enable_lapack}], [ac_LAPACK=no])

 case ${ac_LAPACK} in
@@ -104,18 +83,6 @@ case ${ac_LAPACK} in
         AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac

-############### FP16 conversions
-AC_ARG_ENABLE([sfw-fp16],
-    [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
-    [ac_SFW_FP16=${enable_sfw_fp16}], [ac_SFW_FP16=yes])
-case ${ac_SFW_FP16} in
-    yes)
-      AC_DEFINE([SFW_FP16],[1],[software conversion to fp16]);;
-    no);;
-    *)
-      AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
-esac
-
 ############### MKL
 AC_ARG_ENABLE([mkl],
     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
@@ -141,7 +108,7 @@ AC_ARG_WITH([hdf5],

 ############### first-touch
 AC_ARG_ENABLE([numa],
     [AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
     [ac_NUMA=${enable_NUMA}],[ac_NUMA=no])

 case ${ac_NUMA} in
@@ -167,8 +134,8 @@ if test "${ac_MKL}x" != "nox"; then
 fi

 AC_SEARCH_LIBS([__gmpf_init], [gmp],
     [AC_SEARCH_LIBS([mpfr_init], [mpfr],
         [AC_DEFINE([HAVE_LIBMPFR], [1],
             [Define to 1 if you have the `MPFR' library])]
         [have_mpfr=true], [AC_MSG_ERROR([MPFR library not found])])]
     [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library])]
@@ -177,7 +144,7 @@ AC_SEARCH_LIBS([__gmpf_init], [gmp],
 if test "${ac_LAPACK}x" != "nox"; then
     AC_SEARCH_LIBS([LAPACKE_sbdsdc], [lapack], [],
                    [AC_MSG_ERROR("LAPACK enabled but library not found")])
 fi

 AC_SEARCH_LIBS([fftw_execute], [fftw3],
     [AC_SEARCH_LIBS([fftwf_execute], [fftw3f], [],
@@ -185,23 +152,6 @@ AC_SEARCH_LIBS([fftw_execute], [fftw3],
     [AC_DEFINE([HAVE_FFTW], [1], [Define to 1 if you have the `FFTW' library])]
     [have_fftw=true])

-AC_SEARCH_LIBS([limeCreateReader], [lime],
-    [AC_DEFINE([HAVE_LIME], [1], [Define to 1 if you have the `LIME' library])]
-    [have_lime=true],
-    [AC_MSG_WARN(C-LIME library was not found in your system.
-                 In order to use ILGG file format please install or provide the correct path to your installation
-                 Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
-
-AC_SEARCH_LIBS([crc32], [z],
-    [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
-    [have_zlib=true] [LIBS="${LIBS} -lz"],
-    [AC_MSG_ERROR(zlib library was not found in your system.)])
-
-AC_SEARCH_LIBS([move_pages], [numa],
-    [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
-    [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
-    [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
-
 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
     [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
     [have_hdf5=true]
@@ -226,26 +176,19 @@ case ${ax_cv_cxx_compiler_vendor} in
       case ${ac_SIMD} in
         SSE4)
           AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
-          case ${ac_SFW_FP16} in
-            yes)
-              SIMD_FLAGS='-msse4.2';;
-            no)
-              SIMD_FLAGS='-msse4.2 -mf16c';;
-            *)
-              AC_MSG_ERROR(["SFW_FP16 must be either yes or no value ${ac_SFW_FP16} "]);;
-          esac;;
+          SIMD_FLAGS='-msse4.2';;
         AVX)
           AC_DEFINE([AVX1],[1],[AVX intrinsics])
-          SIMD_FLAGS='-mavx -mf16c';;
+          SIMD_FLAGS='-mavx';;
         AVXFMA4)
           AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
-          SIMD_FLAGS='-mavx -mfma4 -mf16c';;
+          SIMD_FLAGS='-mavx -mfma4';;
         AVXFMA)
           AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
-          SIMD_FLAGS='-mavx -mfma -mf16c';;
+          SIMD_FLAGS='-mavx -mfma';;
         AVX2)
           AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
-          SIMD_FLAGS='-mavx2 -mfma -mf16c';;
+          SIMD_FLAGS='-mavx2 -mfma';;
         AVX512)
           AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
           SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
@@ -254,7 +197,6 @@ case ${ax_cv_cxx_compiler_vendor} in
           SIMD_FLAGS='';;
         KNL)
          AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
-          AC_DEFINE([KNL],[1],[Knights landing processor])
           SIMD_FLAGS='-march=knl';;
         GEN)
           AC_DEFINE([GEN],[1],[generic vector code])
@@ -262,9 +204,6 @@ case ${ax_cv_cxx_compiler_vendor} in
             [generic SIMD vector width (in bytes)])
           SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
           SIMD_FLAGS='';;
-        NEONv8)
-          AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
-          SIMD_FLAGS='-march=armv8-a';;
         QPX|BGQ)
           AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
           SIMD_FLAGS='';;
@@ -293,7 +232,6 @@ case ${ax_cv_cxx_compiler_vendor} in
           SIMD_FLAGS='';;
         KNL)
           AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
-          AC_DEFINE([KNL],[1],[Knights landing processor])
           SIMD_FLAGS='-xmic-avx512';;
         GEN)
           AC_DEFINE([GEN],[1],[generic vector code])
@@ -331,41 +269,8 @@ case ${ac_PRECISION} in
     double)
       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
       ;;
-    *)
-      AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
-      ;;
 esac

-###################### Shared memory allocation technique under MPI3
-AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs],
-              [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
-
-case ${ac_SHM} in
-
-    shmget)
-      AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
-      ;;
-
-    shmopen)
-      AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
-      ;;
-
-    hugetlbfs)
-      AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
-      ;;
-
-    *)
-      AC_MSG_ERROR([${ac_SHM} unsupported --enable-shm option]);
-      ;;
-esac
-
-###################### Shared base path for SHMMMAP
-AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
-              [Select SHM mmap base path for hugetlbfs])],
-              [ac_SHMPATH=${enable_shmpath}],
-              [ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
-AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
-
 ############### communication type selection
 AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
               [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
@@ -375,14 +280,14 @@ case ${ac_COMMS} in
       AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
       comms_type='none'
       ;;
+    mpi3l*)
+      AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
+      comms_type='mpi3l'
+      ;;
     mpi3*)
       AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
       comms_type='mpi3'
       ;;
-    mpit)
-      AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
-      comms_type='mpit'
-      ;;
     mpi*)
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
       comms_type='mpi'
@@ -392,7 +297,7 @@ case ${ac_COMMS} in
       comms_type='shmem'
      ;;
     *)
      AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
      ;;
 esac
 case ${ac_COMMS} in
@@ -410,13 +315,13 @@ esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])

 ############### RNG selection
 AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937|sitmo],\
                [Select Random Number Generator to be used])],\
-              [ac_RNG=${enable_rng}],[ac_RNG=sitmo])
+              [ac_RNG=${enable_rng}],[ac_RNG=ranlux48])

 case ${ac_RNG} in
     ranlux48)
@@ -429,7 +334,7 @@ case ${ac_RNG} in
      AC_DEFINE([RNG_SITMO],[1],[RNG_SITMO] )
      ;;
     *)
      AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
      ;;
 esac

@@ -446,7 +351,7 @@ case ${ac_TIMERS} in
      AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
      ;;
     *)
      AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
      ;;
 esac

@@ -458,7 +363,7 @@ case ${ac_CHROMA} in
     yes|no)
      ;;
     *)
      AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]);
      ;;
 esac

@@ -479,67 +384,12 @@ DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])

 ############### Ouput
 cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
-GRID_CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
-GRID_LDFLAGS="$AM_LDFLAGS $LDFLAGS"
-GRID_LIBS=$LIBS
-GRID_SHORT_SHA=`git rev-parse --short HEAD`
-GRID_SHA=`git rev-parse HEAD`
-GRID_BRANCH=`git rev-parse --abbrev-ref HEAD`
 AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
 AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
 AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
 AC_SUBST([AM_CFLAGS])
 AC_SUBST([AM_CXXFLAGS])
 AC_SUBST([AM_LDFLAGS])
-AC_SUBST([GRID_CXXFLAGS])
-AC_SUBST([GRID_LDFLAGS])
-AC_SUBST([GRID_LIBS])
-AC_SUBST([GRID_SHA])
-AC_SUBST([GRID_BRANCH])
-
-git_commit=`cd $srcdir && ./scripts/configure.commit`
-
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Summary of configuration for $PACKAGE v$VERSION
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
------ GIT VERSION -------------------------------------
-$git_commit
------ PLATFORM ----------------------------------------
-architecture (build)        : $build_cpu
-os (build)                  : $build_os
-architecture (target)       : $target_cpu
-os (target)                 : $target_os
-compiler vendor             : ${ax_cv_cxx_compiler_vendor}
-compiler version            : ${ax_cv_gxx_version}
------ BUILD OPTIONS -----------------------------------
-SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
-Threading                   : ${ac_openmp}
-Communications type         : ${comms_type}
-Shared memory allocator     : ${ac_SHM}
-Shared memory mmap path     : ${ac_SHMPATH}
-Default precision           : ${ac_PRECISION}
-Software FP16 conversion    : ${ac_SFW_FP16}
-RNG choice                  : ${ac_RNG}
-GMP                         : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
-LAPACK                      : ${ac_LAPACK}
-FFTW                        : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
-LIME (ILDG support)         : `if test "x$have_lime" = xtrue; then echo yes; else echo no; fi`
-HDF5                        : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi`
-build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
------ BUILD FLAGS -------------------------------------
-CXXFLAGS:
-`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
-LDFLAGS:
-`echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
-LIBS:
-`echo ${LIBS} | tr ' ' '\n' | sed 's/^-/ -/g'`
--------------------------------------------------------" > grid.configure.summary
-
-GRID_SUMMARY="`cat grid.configure.summary`"
-AM_SUBST_NOTMAKE([GRID_SUMMARY])
-AC_SUBST([GRID_SUMMARY])
-
-AC_CONFIG_FILES([grid-config], [chmod +x grid-config])
 AC_CONFIG_FILES(Makefile)
 AC_CONFIG_FILES(lib/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
@@ -550,15 +400,42 @@ AC_CONFIG_FILES(tests/forces/Makefile)
 AC_CONFIG_FILES(tests/hadrons/Makefile)
 AC_CONFIG_FILES(tests/hmc/Makefile)
 AC_CONFIG_FILES(tests/solver/Makefile)
-AC_CONFIG_FILES(tests/smearing/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
-AC_CONFIG_FILES(tests/testu01/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_CONFIG_FILES(extras/Makefile)
 AC_CONFIG_FILES(extras/Hadrons/Makefile)
 AC_OUTPUT

-echo ""
-cat grid.configure.summary
-echo ""
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Summary of configuration for $PACKAGE v$VERSION
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+----- PLATFORM ----------------------------------------
+architecture (build)        : $build_cpu
+os (build)                  : $build_os
+architecture (target)       : $target_cpu
+os (target)                 : $target_os
+compiler vendor             : ${ax_cv_cxx_compiler_vendor}
+compiler version            : ${ax_cv_gxx_version}
+----- BUILD OPTIONS -----------------------------------
+SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
+Threading                   : ${ac_openmp}
+Communications type         : ${comms_type}
+Default precision           : ${ac_PRECISION}
+RNG choice                  : ${ac_RNG}
+GMP                         : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
+LAPACK                      : ${ac_LAPACK}
+FFTW                        : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
+HDF5                        : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi`
+build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
+----- BUILD FLAGS -------------------------------------
+CXXFLAGS:
+`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
+LDFLAGS:
+`echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
+LIBS:
+`echo ${LIBS} | tr ' ' '\n' | sed 's/^-/ -/g'`
+-------------------------------------------------------" > config.summary
+echo ""
+cat config.summary
+echo ""
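Every `AC_DEFINE` in the hunks above ends up as a preprocessor macro in the generated `lib/Config.h` (with the `PACKAGE_` prefix rewritten to `GRID_` by the `sed` in `AC_CONFIG_HEADERS`), and compiled code selects behaviour by testing those macros. The snippet below is only an illustrative consumer of that mechanism, not Grid's actual source; it assumes the macro spellings taken from the `AC_DEFINE` calls in this file and leaves the generated header include commented out so it also compiles standalone:

```cpp
// Illustrative consumer of the configure-time macros above (not Grid's own code).
#include <iostream>
// #include "Config.h"   // generated by AC_CONFIG_HEADERS([lib/Config.h],...)

#if defined(GRID_COMMS_MPI3)
static const char *comms = "mpi3";
#elif defined(GRID_COMMS_MPI)
static const char *comms = "mpi";
#elif defined(GRID_COMMS_SHMEM)
static const char *comms = "shmem";
#else
static const char *comms = "none";   // --enable-comms=none is the default above
#endif

int main()
{
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
    const char *precision = "double";
#else
    const char *precision = "single";
#endif
    std::cout << "comms=" << comms << " precision=" << precision << std::endl;
    return 0;
}
```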
@@ -162,8 +162,7 @@ void Application::saveParameterFile(const std::string parameterFileName)
 sizeString((size)*locVol_) << " (" << sizeString(size) << "/site)"

 #define DEFINE_MEMPEAK \
-GeneticScheduler<unsigned int>::ObjFunc memPeak = \
-[this](const std::vector<unsigned int> &program)\
+auto memPeak = [this](const std::vector<unsigned int> &program)\
 {\
     unsigned int memPeak;\
     bool         msg;\
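The only difference between the two sides of the `DEFINE_MEMPEAK` hunk is how the lambda is bound: to a named `GeneticScheduler<unsigned int>::ObjFunc` (a `std::function`-style type, type-erased) or to `auto` (the closure's own type). A standalone sketch of that distinction, with `ObjFunc` and the capture standing in for the Hadrons types:

```cpp
// Sketch of the two bindings in the DEFINE_MEMPEAK hunk; ObjFunc is a stand-in
// assumed to behave like GeneticScheduler<unsigned int>::ObjFunc.
#include <functional>
#include <iostream>
#include <vector>

using ObjFunc = std::function<int(const std::vector<unsigned int> &)>;

int main()
{
    int scale = 2;

    // explicit std::function binding (type-erased, as on one side of the hunk)
    ObjFunc memPeakA = [scale](const std::vector<unsigned int> &program)
    { return scale * static_cast<int>(program.size()); };

    // auto binding keeps the closure's own type (as on the other side)
    auto memPeakB = [scale](const std::vector<unsigned int> &program)
    { return scale * static_cast<int>(program.size()); };

    std::vector<unsigned int> program{1, 2, 3};
    std::cout << memPeakA(program) << " " << memPeakB(program) << std::endl; // 6 6
    return 0;
}
```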
@@ -41,10 +41,9 @@ using namespace Hadrons;
 // constructor /////////////////////////////////////////////////////////////////
 Environment::Environment(void)
 {
-    dim_ = GridDefaultLatt();
-    nd_  = dim_.size();
+    nd_ = GridDefaultLatt().size();
     grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
-        dim_, GridDefaultSimd(nd_, vComplex::Nsimd()),
+        GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()),
         GridDefaultMpi()));
     gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
     auto loc = getGrid()->LocalDimensions();
@@ -133,16 +132,6 @@ unsigned int Environment::getNd(void) const
     return nd_;
 }

-std::vector<int> Environment::getDim(void) const
-{
-    return dim_;
-}
-
-int Environment::getDim(const unsigned int mu) const
-{
-    return dim_[mu];
-}
-
 // random number generator /////////////////////////////////////////////////////
 void Environment::setSeed(const std::vector<int> &seed)
 {
@@ -282,21 +271,6 @@ std::string Environment::getModuleType(const std::string name) const
     return getModuleType(getModuleAddress(name));
 }

-std::string Environment::getModuleNamespace(const unsigned int address) const
-{
-    std::string type = getModuleType(address), ns;
-
-    auto pos2 = type.rfind("::");
-    auto pos1 = type.rfind("::", pos2 - 2);
-
-    return type.substr(pos1 + 2, pos2 - pos1 - 2);
-}
-
-std::string Environment::getModuleNamespace(const std::string name) const
-{
-    return getModuleNamespace(getModuleAddress(name));
-}
-
 bool Environment::hasModule(const unsigned int address) const
 {
     return (address < module_.size());
@@ -518,14 +492,7 @@ std::string Environment::getObjectType(const unsigned int address) const
 {
     if (hasRegisteredObject(address))
     {
-        if (object_[address].type)
-        {
-            return typeName(object_[address].type);
-        }
-        else
-        {
-            return "<no type>";
-        }
+        return typeName(object_[address].type);
     }
     else if (hasObject(address))
     {
@@ -565,23 +532,6 @@ Environment::Size Environment::getObjectSize(const std::string name) const
     return getObjectSize(getObjectAddress(name));
 }

-unsigned int Environment::getObjectModule(const unsigned int address) const
-{
-    if (hasObject(address))
-    {
-        return object_[address].module;
-    }
-    else
-    {
-        HADRON_ERROR("no object with address " + std::to_string(address));
-    }
-}
-
-unsigned int Environment::getObjectModule(const std::string name) const
-{
-    return getObjectModule(getObjectAddress(name));
-}
-
 unsigned int Environment::getObjectLs(const unsigned int address) const
 {
     if (hasRegisteredObject(address))
@@ -106,8 +106,6 @@ public:
     void                    createGrid(const unsigned int Ls);
     GridCartesian *         getGrid(const unsigned int Ls = 1) const;
     GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
-    std::vector<int>        getDim(void) const;
-    int                     getDim(const unsigned int mu) const;
     unsigned int            getNd(void) const;
     // random number generator
     void                    setSeed(const std::vector<int> &seed);
@@ -133,8 +131,6 @@ public:
     std::string         getModuleName(const unsigned int address) const;
     std::string         getModuleType(const unsigned int address) const;
     std::string         getModuleType(const std::string name) const;
-    std::string         getModuleNamespace(const unsigned int address) const;
-    std::string         getModuleNamespace(const std::string name) const;
     bool                hasModule(const unsigned int address) const;
     bool                hasModule(const std::string name) const;
     Graph<unsigned int> makeModuleGraph(void) const;
@@ -175,8 +171,6 @@ public:
     std::string         getObjectType(const std::string name) const;
     Size                getObjectSize(const unsigned int address) const;
     Size                getObjectSize(const std::string name) const;
-    unsigned int        getObjectModule(const unsigned int address) const;
-    unsigned int        getObjectModule(const std::string name) const;
     unsigned int        getObjectLs(const unsigned int address) const;
     unsigned int        getObjectLs(const std::string name) const;
     bool                hasObject(const unsigned int address) const;
@@ -187,10 +181,6 @@ public:
     bool                hasCreatedObject(const std::string name) const;
     bool                isObject5d(const unsigned int address) const;
     bool                isObject5d(const std::string name) const;
-    template <typename T>
-    bool                isObjectOfType(const unsigned int address) const;
-    template <typename T>
-    bool                isObjectOfType(const std::string name) const;
     Environment::Size   getTotalSize(void) const;
     void                addOwnership(const unsigned int owner,
                                      const unsigned int property);
@@ -207,7 +197,6 @@ private:
     bool                                dryRun_{false};
     unsigned int                        traj_, locVol_;
     // grids
-    std::vector<int>                    dim_;
     GridPt                              grid4d_;
     std::map<unsigned int, GridPt>      grid5d_;
     GridRbPt                            gridRb4d_;
@@ -354,7 +343,7 @@ T * Environment::getObject(const unsigned int address) const
     else
     {
         HADRON_ERROR("object with address " + std::to_string(address) +
-                     " does not have type '" + typeName(&typeid(T)) +
+                     " does not have type '" + typeid(T).name() +
                      "' (has type '" + getObjectType(address) + "')");
     }
 }
@@ -391,37 +380,6 @@ T * Environment::createLattice(const std::string name)
     return createLattice<T>(getObjectAddress(name));
 }

-template <typename T>
-bool Environment::isObjectOfType(const unsigned int address) const
-{
-    if (hasRegisteredObject(address))
-    {
-        if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
-        {
-            return true;
-        }
-        else
-        {
-            return false;
-        }
-    }
-    else if (hasObject(address))
-    {
-        HADRON_ERROR("object with address " + std::to_string(address) +
-                     " exists but is not registered");
-    }
-    else
-    {
-        HADRON_ERROR("no object with address " + std::to_string(address));
-    }
-}
-
-template <typename T>
-bool Environment::isObjectOfType(const std::string name) const
-{
-    return isObjectOfType<T>(getObjectAddress(name));
-}
-
 END_HADRONS_NAMESPACE

 #endif // Hadrons_Environment_hpp_
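The practical difference in the `getObject` error message above is demangling: `typeid(T).name()` returns the implementation's mangled spelling on GCC/Clang, whereas a `typeName`-style helper usually prints the human-readable form. The standalone illustration below uses `abi::__cxa_demangle` to show that gap; it is only an illustration of the idea, not the Hadrons `typeName()` implementation, and it assumes a GCC/Clang toolchain:

```cpp
// Why a demangling helper and typeid(T).name() print differently on GCC/Clang.
#include <cxxabi.h>
#include <cstdlib>
#include <iostream>
#include <string>
#include <typeinfo>
#include <vector>

template <typename T>
std::string demangledName(void)
{
    int   status = 0;
    char *s      = abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status);
    std::string out = (status == 0 && s) ? s : typeid(T).name();
    std::free(s);
    return out;
}

int main()
{
    std::cout << typeid(std::vector<int>).name() << std::endl;   // mangled, e.g. "St6vectorIiSaIiEE"
    std::cout << demangledName<std::vector<int>>() << std::endl; // "std::vector<int, std::allocator<int> >"
    return 0;
}
```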
@@ -51,43 +51,23 @@ using Grid::operator<<;
  * error with GCC 5 (clang & GCC 6 compile fine without it).
  */

+// FIXME: find a way to do that in a more general fashion
 #ifndef FIMPL
 #define FIMPL WilsonImplR
 #endif
-#ifndef SIMPL
-#define SIMPL ScalarImplCR
-#endif

 BEGIN_HADRONS_NAMESPACE

 // type aliases
-#define FERM_TYPE_ALIASES(FImpl, suffix)\
+#define TYPE_ALIASES(FImpl, suffix)\
 typedef FermionOperator<FImpl>          FMat##suffix;            \
 typedef typename FImpl::FermionField    FermionField##suffix;    \
 typedef typename FImpl::PropagatorField PropagatorField##suffix; \
 typedef typename FImpl::SitePropagator  SitePropagator##suffix;  \
-typedef std::vector<typename FImpl::SitePropagator::scalar_object> \
-                                        SlicedPropagator##suffix;
-
-#define GAUGE_TYPE_ALIASES(FImpl, suffix)\
-typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
-
-#define SCALAR_TYPE_ALIASES(SImpl, suffix)\
-typedef typename SImpl::Field ScalarField##suffix;\
-typedef typename SImpl::Field PropagatorField##suffix;
-
-#define SOLVER_TYPE_ALIASES(FImpl, suffix)\
-typedef std::function<void(FermionField##suffix &,\
-                           const FermionField##suffix &)> SolverFn##suffix;
-
-#define SINK_TYPE_ALIASES(suffix)\
-typedef std::function<SlicedPropagator##suffix(const PropagatorField##suffix &)> SinkFn##suffix;
-
-#define FGS_TYPE_ALIASES(FImpl, suffix)\
-FERM_TYPE_ALIASES(FImpl, suffix)\
-GAUGE_TYPE_ALIASES(FImpl, suffix)\
-SOLVER_TYPE_ALIASES(FImpl, suffix)
+typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;\
+typedef std::function<void(FermionField##suffix &, \
+                           const FermionField##suffix &)> SolverFn##suffix;

 // logger
 class HadronsLogger: public Logger
 {
@@ -165,15 +145,6 @@ std::string typeName(void)
     return typeName(typeIdPt<T>());
 }

-// default writers/readers
-#ifdef HAVE_HDF5
-typedef Hdf5Reader CorrReader;
-typedef Hdf5Writer CorrWriter;
-#else
-typedef XmlReader  CorrReader;
-typedef XmlWriter  CorrWriter;
-#endif
-
 END_HADRONS_NAMESPACE

 #endif // Hadrons_Global_hpp_
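Both the single `TYPE_ALIASES` macro and the split `FERM_/GAUGE_/SOLVER_TYPE_ALIASES` family above work the same way: they paste a caller-supplied suffix onto a block of typedefs so a module can alias several implementations at once (`FermionField1`, `FermionField2`, ...). A compilable sketch of just that token-pasting mechanism, with `DummyImpl` standing in for a Grid fermion implementation:

```cpp
// Compilable sketch of the suffix-pasting idiom behind TYPE_ALIASES / FERM_TYPE_ALIASES.
// DummyImpl is a stand-in type; only the ## mechanics are the point here.
#include <iostream>
#include <vector>

struct DummyImpl
{
    typedef std::vector<float>  FermionField;
    typedef std::vector<double> PropagatorField;
};

#define FERM_TYPE_ALIASES(FImpl, suffix)\
typedef typename FImpl::FermionField    FermionField##suffix;\
typedef typename FImpl::PropagatorField PropagatorField##suffix;

template <typename FImpl1, typename FImpl2>
class TwoFlavourModule
{
public:
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
};

int main()
{
    TwoFlavourModule<DummyImpl, DummyImpl>::FermionField1    q1(3, 1.f);
    TwoFlavourModule<DummyImpl, DummyImpl>::PropagatorField2 q2(3, 2.0);
    std::cout << q1.size() + q2.size() << std::endl; // 6
    return 0;
}
```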
@@ -1,25 +1,40 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: extras/Hadrons/Modules.hpp
+
+Copyright (C) 2015
+Copyright (C) 2016
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
 #include <Grid/Hadrons/Modules/MAction/DWF.hpp>
 #include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
 #include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
-#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
-#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
 #include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
-#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Load.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Random.hpp>
-#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
-#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
-#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
-#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
-#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
-#include <Grid/Hadrons/Modules/MSink/Point.hpp>
 #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Grid/Hadrons/Modules/MSource/Point.hpp>
 #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
-#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
 #include <Grid/Hadrons/Modules/MSource/Z2.hpp>
+#include <Grid/Hadrons/Modules/Quark.hpp>
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */

-#ifndef Hadrons_MAction_DWF_hpp_
-#define Hadrons_MAction_DWF_hpp_
+#ifndef Hadrons_DWF_hpp_
+#define Hadrons_DWF_hpp_

 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -48,15 +48,14 @@ public:
                                     std::string , gauge,
                                     unsigned int, Ls,
                                     double      , mass,
-                                    double      , M5,
-                                    std::string , boundary);
+                                    double      , M5);
 };

 template <typename FImpl>
 class TDWF: public Module<DWFPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TDWF(const std::string name);
@@ -117,19 +116,14 @@ void TDWF<FImpl>::execute(void)
                  << par().mass << ", M5= " << par().M5 << " and Ls= "
                  << par().Ls << " using gauge field '" << par().gauge << "'"
                  << std::endl;
-    LOG(Message) << "Fermion boundary conditions: " << par().boundary
-                 << std::endl;
     env().createGrid(par().Ls);
     auto &U    = *env().template getObject<LatticeGaugeField>(par().gauge);
     auto &g4   = *env().getGrid();
     auto &grb4 = *env().getRbGrid();
     auto &g5   = *env().getGrid(par().Ls);
     auto &grb5 = *env().getRbGrid(par().Ls);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
     FMat *fMatPt = new DomainWallFermion<FImpl>(U, g5, grb5, g4, grb4,
-                                                par().mass, par().M5,
-                                                implParams);
+                                                par().mass, par().M5);
     env().setObject(getName(), fMatPt);
 }
@@ -137,4 +131,4 @@ END_MODULE_NAMESPACE

 END_HADRONS_NAMESPACE

-#endif // Hadrons_MAction_DWF_hpp_
+#endif // Hadrons_DWF_hpp_
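The lines removed from this module feed the `boundary` parameter string through `strToVec<Complex>` to build the fermion-action `ImplParams`, i.e. one phase per direction. The sketch below shows what parsing such a string could look like; the example value `"1 1 1 -1"` (antiperiodic in time) and the use of `std::istringstream` in place of Grid's `strToVec` are assumptions made purely for illustration:

```cpp
// Standalone sketch of parsing a boundary-phase string like par().boundary.
// "1 1 1 -1" is an assumed example value; istringstream stands in for strToVec<Complex>.
#include <complex>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main()
{
    std::string boundary = "1 1 1 -1";
    std::vector<std::complex<double>> phases;

    std::istringstream reader(boundary);
    double re;
    while (reader >> re) phases.push_back(std::complex<double>(re, 0.));

    for (auto &p : phases) std::cout << p << " ";   // (1,0) (1,0) (1,0) (-1,0)
    std::cout << std::endl;
    return 0;
}
```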
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */

-#ifndef Hadrons_MAction_Wilson_hpp_
-#define Hadrons_MAction_Wilson_hpp_
+#ifndef Hadrons_Wilson_hpp_
+#define Hadrons_Wilson_hpp_

 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -46,15 +46,14 @@ class WilsonPar: Serializable
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
                                     std::string, gauge,
-                                    double     , mass,
-                                    std::string, boundary);
+                                    double     , mass);
 };

 template <typename FImpl>
 class TWilson: public Module<WilsonPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TWilson(const std::string name);
@@ -113,15 +112,10 @@ void TWilson<FImpl>::execute()
 {
     LOG(Message) << "Setting up TWilson fermion matrix with m= " << par().mass
                  << " using gauge field '" << par().gauge << "'" << std::endl;
-    LOG(Message) << "Fermion boundary conditions: " << par().boundary
-                 << std::endl;
     auto &U      = *env().template getObject<LatticeGaugeField>(par().gauge);
     auto &grid   = *env().getGrid();
     auto &gridRb = *env().getRbGrid();
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
-    FMat *fMatPt = new WilsonFermion<FImpl>(U, grid, gridRb, par().mass,
-                                            implParams);
+    FMat *fMatPt = new WilsonFermion<FImpl>(U, grid, gridRb, par().mass);
     env().setObject(getName(), fMatPt);
 }
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */

-#ifndef Hadrons_MContraction_Baryon_hpp_
-#define Hadrons_MContraction_Baryon_hpp_
+#ifndef Hadrons_Baryon_hpp_
+#define Hadrons_Baryon_hpp_

 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -55,9 +55,9 @@ template <typename FImpl1, typename FImpl2, typename FImpl3>
 class TBaryon: public Module<BaryonPar>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl1, 1);
-    FERM_TYPE_ALIASES(FImpl2, 2);
-    FERM_TYPE_ALIASES(FImpl3, 3);
+    TYPE_ALIASES(FImpl1, 1);
+    TYPE_ALIASES(FImpl2, 2);
+    TYPE_ALIASES(FImpl3, 3);
     class Result: Serializable
     {
     public:
@@ -112,7 +112,7 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
                  << " quarks '" << par().q1 << "', '" << par().q2 << "', and '"
                  << par().q3 << "'" << std::endl;

-    CorrWriter       writer(par().output);
+    XmlWriter        writer(par().output);
     PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
     PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
     PropagatorField3 &q3 = *env().template getObject<PropagatorField3>(par().q2);
@@ -121,11 +121,11 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)

     // FIXME: do contractions

-    // write(writer, "meson", result);
+    write(writer, "meson", result);
 }

 END_MODULE_NAMESPACE

 END_HADRONS_NAMESPACE

-#endif // Hadrons_MContraction_Baryon_hpp_
+#endif // Hadrons_Baryon_hpp_
@ -1,144 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MContraction/DiscLoop.hpp

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#ifndef Hadrons_MContraction_DiscLoop_hpp_
#define Hadrons_MContraction_DiscLoop_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                                DiscLoop                                    *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)

class DiscLoopPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(DiscLoopPar,
                                    std::string,    q_loop,
                                    Gamma::Algebra, gamma,
                                    std::string,    output);
};

template <typename FImpl>
class TDiscLoop: public Module<DiscLoopPar>
{
    FERM_TYPE_ALIASES(FImpl,);
    class Result: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        Gamma::Algebra,       gamma,
                                        std::vector<Complex>, corr);
    };
public:
    // constructor
    TDiscLoop(const std::string name);
    // destructor
    virtual ~TDiscLoop(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
};

MODULE_REGISTER_NS(DiscLoop, TDiscLoop<FIMPL>, MContraction);

/******************************************************************************
 *                       TDiscLoop implementation                             *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TDiscLoop<FImpl>::TDiscLoop(const std::string name)
: Module<DiscLoopPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TDiscLoop<FImpl>::getInput(void)
{
    std::vector<std::string> in = {par().q_loop};
    
    return in;
}

template <typename FImpl>
std::vector<std::string> TDiscLoop<FImpl>::getOutput(void)
{
    std::vector<std::string> out = {getName()};
    
    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TDiscLoop<FImpl>::setup(void)
{
    
}

// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TDiscLoop<FImpl>::execute(void)
{
    LOG(Message) << "Computing disconnected loop contraction '" << getName()
                 << "' using '" << par().q_loop << "' with " << par().gamma
                 << " insertion." << std::endl;

    CorrWriter            writer(par().output);
    PropagatorField       &q_loop = *env().template getObject<PropagatorField>(par().q_loop);
    LatticeComplex        c(env().getGrid());
    Gamma                 gamma(par().gamma);
    std::vector<TComplex> buf;
    Result                result;

    c = trace(gamma*q_loop);
    sliceSum(c, buf, Tp);

    result.gamma = par().gamma;
    result.corr.resize(buf.size());
    for (unsigned int t = 0; t < buf.size(); ++t)
    {
        result.corr[t] = TensorRemove(buf[t]);
    }

    write(writer, "disc", result);
}

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MContraction_DiscLoop_hpp_
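For orientation while reading this part of the diff: the removed DiscLoop module computes a time-sliced disconnected loop with a single gamma insertion. As a sketch in conventional notation (inferred from `c = trace(gamma*q_loop)` followed by `sliceSum(c, buf, Tp)` above, not text taken from the source), the stored correlator is

    C(t) = \sum_{\vec{x}} \mathrm{tr}\,[\, \Gamma \, L(\vec{x},t) \,],

where L is the propagator registered under q_loop and Γ is the algebra element selected by the gamma parameter.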
@ -1,170 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MContraction/Gamma3pt.hpp

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#ifndef Hadrons_MContraction_Gamma3pt_hpp_
#define Hadrons_MContraction_Gamma3pt_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/*
 * 3pt contraction with gamma matrix insertion.
 *
 * Schematic (original ASCII diagram): q1 connects the source point i to the
 * sink point f directly, while q2 and q3 form the return line from f back to
 * i with the gamma matrix inserted between them.
 *
 * trace(g5*q1*adj(q2)*g5*gamma*q3)
 */

/******************************************************************************
 *                               Gamma3pt                                     *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)

class Gamma3ptPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(Gamma3ptPar,
                                    std::string,    q1,
                                    std::string,    q2,
                                    std::string,    q3,
                                    Gamma::Algebra, gamma,
                                    std::string,    output);
};

template <typename FImpl1, typename FImpl2, typename FImpl3>
class TGamma3pt: public Module<Gamma3ptPar>
{
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
    FERM_TYPE_ALIASES(FImpl3, 3);
    class Result: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        Gamma::Algebra,       gamma,
                                        std::vector<Complex>, corr);
    };
public:
    // constructor
    TGamma3pt(const std::string name);
    // destructor
    virtual ~TGamma3pt(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
};

MODULE_REGISTER_NS(Gamma3pt, ARG(TGamma3pt<FIMPL, FIMPL, FIMPL>), MContraction);

/******************************************************************************
 *                       TGamma3pt implementation                             *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2, typename FImpl3>
TGamma3pt<FImpl1, FImpl2, FImpl3>::TGamma3pt(const std::string name)
: Module<Gamma3ptPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2, typename FImpl3>
std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getInput(void)
{
    std::vector<std::string> in = {par().q1, par().q2, par().q3};
    
    return in;
}

template <typename FImpl1, typename FImpl2, typename FImpl3>
std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getOutput(void)
{
    std::vector<std::string> out = {getName()};
    
    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2, typename FImpl3>
void TGamma3pt<FImpl1, FImpl2, FImpl3>::setup(void)
{
    
}

// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2, typename FImpl3>
void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
{
    LOG(Message) << "Computing 3pt contractions '" << getName() << "' using"
                 << " quarks '" << par().q1 << "', '" << par().q2 << "' and '"
                 << par().q3 << "', with " << par().gamma << " insertion."
                 << std::endl;

    CorrWriter            writer(par().output);
    PropagatorField1      &q1 = *env().template getObject<PropagatorField1>(par().q1);
    PropagatorField2      &q2 = *env().template getObject<PropagatorField2>(par().q2);
    PropagatorField3      &q3 = *env().template getObject<PropagatorField3>(par().q3);
    LatticeComplex        c(env().getGrid());
    Gamma                 g5(Gamma::Algebra::Gamma5);
    Gamma                 gamma(par().gamma);
    std::vector<TComplex> buf;
    Result                result;

    c = trace(g5*q1*adj(q2)*(g5*gamma)*q3);
    sliceSum(c, buf, Tp);

    result.gamma = par().gamma;
    result.corr.resize(buf.size());
    for (unsigned int t = 0; t < buf.size(); ++t)
    {
        result.corr[t] = TensorRemove(buf[t]);
    }

    write(writer, "gamma3pt", result);
}

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MContraction_Gamma3pt_hpp_
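The removed Gamma3pt module evaluates the trace quoted in its header comment; written out as a time-sliced correlator (a sketch in standard notation with the spatial sum made explicit), it is

    C(t) = \sum_{\vec{x}} \mathrm{tr}\,[\, \gamma_5\, q_1(\vec{x},t)\, q_2^{\dagger}(\vec{x},t)\, \gamma_5\, \Gamma\, q_3(\vec{x},t) \,],

which matches `c = trace(g5*q1*adj(q2)*(g5*gamma)*q3)` followed by `sliceSum(c, buf, Tp)` in the code above.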
@ -6,10 +6,8 @@ Source file: extras/Hadrons/Modules/MContraction/Meson.hpp
 Copyright (C) 2015
 Copyright (C) 2016
-Copyright (C) 2017
 
 Author: Antonin Portelli <antonin.portelli@me.com>
-        Andrew Lawson <andrew.lawson1991@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -29,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
 
-#ifndef Hadrons_MContraction_Meson_hpp_
-#define Hadrons_MContraction_Meson_hpp_
+#ifndef Hadrons_Meson_hpp_
+#define Hadrons_Meson_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@ -38,56 +36,32 @@ See the full license in the file "LICENSE" in the top level distribution directory
 BEGIN_HADRONS_NAMESPACE
 
-/*
- 
- Meson contractions
- -----------------------------
- 
- * options:
- - q1: input propagator 1 (string)
- - q2: input propagator 2 (string)
- - gammas: gamma products to insert at sink & source, pairs of gamma matrices
-           (space-separated strings) in angled brackets (i.e. <g_sink g_src>),
-           in a sequence (e.g. "<Gamma5 Gamma5><Gamma5 GammaT>").
- 
-           Special values: "all" - perform all possible contractions.
- - mom: momentum insertion, space-separated float sequence (e.g ".1 .2 1. 0."),
-        given as multiples of (2*pi) / L.
- */
-
 /******************************************************************************
  *                                TMeson                                      *
  ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 
-typedef std::pair<Gamma::Algebra, Gamma::Algebra> GammaPair;
-
 class MesonPar: Serializable
 {
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(MesonPar,
                                     std::string, q1,
                                     std::string, q2,
-                                    std::string, gammas,
-                                    std::string, sink,
-                                    std::string, output);
+                                    std::string, output,
+                                    Gamma::Algebra, gammaSource,
+                                    Gamma::Algebra, gammaSink);
 };
 
 template <typename FImpl1, typename FImpl2>
 class TMeson: public Module<MesonPar>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl1, 1);
-    FERM_TYPE_ALIASES(FImpl2, 2);
-    FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
-    SINK_TYPE_ALIASES(Scalar);
+    TYPE_ALIASES(FImpl1, 1);
+    TYPE_ALIASES(FImpl2, 2);
     class Result: Serializable
     {
     public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        Gamma::Algebra, gamma_snk,
-                                        Gamma::Algebra, gamma_src,
-                                        std::vector<Complex>, corr);
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result, std::vector<Complex>, corr);
     };
 public:
     // constructor
@ -97,7 +71,6 @@ public:
     // dependencies/products
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
-    virtual void parseGammaString(std::vector<GammaPair> &gammaList);
     // execution
     virtual void execute(void);
 };
@ -117,7 +90,7 @@ TMeson<FImpl1, FImpl2>::TMeson(const std::string name)
 template <typename FImpl1, typename FImpl2>
 std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void)
 {
-    std::vector<std::string> input = {par().q1, par().q2, par().sink};
+    std::vector<std::string> input = {par().q1, par().q2};
     
     return input;
 }
@ -130,35 +103,7 @@ std::vector<std::string> TMeson<FImpl1, FImpl2>::getOutput(void)
     return output;
 }
 
-template <typename FImpl1, typename FImpl2>
-void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)
-{
-    gammaList.clear();
-    // Determine gamma matrices to insert at source/sink.
-    if (par().gammas.compare("all") == 0)
-    {
-        // Do all contractions.
-        for (unsigned int i = 1; i < Gamma::nGamma; i += 2)
-        {
-            for (unsigned int j = 1; j < Gamma::nGamma; j += 2)
-            {
-                gammaList.push_back(std::make_pair((Gamma::Algebra)i,
-                                                   (Gamma::Algebra)j));
-            }
-        }
-    }
-    else
-    {
-        // Parse individual contractions from input string.
-        gammaList = strToVec<GammaPair>(par().gammas);
-    }
-}
-
 // execution ///////////////////////////////////////////////////////////////////
-#define mesonConnected(q1, q2, gSnk, gSrc) \
-(g5*(gSnk))*(q1)*(adj(gSrc)*g5)*adj(q2)
-
 template <typename FImpl1, typename FImpl2>
 void TMeson<FImpl1, FImpl2>::execute(void)
 {
@ -166,73 +111,21 @@ void TMeson<FImpl1, FImpl2>::execute(void)
                  << " quarks '" << par().q1 << "' and '" << par().q2 << "'"
                  << std::endl;
     
-    CorrWriter             writer(par().output);
-    std::vector<TComplex>  buf;
-    std::vector<Result>    result;
-    Gamma                  g5(Gamma::Algebra::Gamma5);
-    std::vector<GammaPair> gammaList;
-    int                    nt = env().getDim(Tp);
-
-    parseGammaString(gammaList);
-    result.resize(gammaList.size());
-    for (unsigned int i = 0; i < result.size(); ++i)
-    {
-        result[i].gamma_snk = gammaList[i].first;
-        result[i].gamma_src = gammaList[i].second;
-        result[i].corr.resize(nt);
-    }
-    if (env().template isObjectOfType<SlicedPropagator1>(par().q1) and
-        env().template isObjectOfType<SlicedPropagator2>(par().q2))
-    {
-        SlicedPropagator1 &q1 = *env().template getObject<SlicedPropagator1>(par().q1);
-        SlicedPropagator2 &q2 = *env().template getObject<SlicedPropagator2>(par().q2);
-        
-        LOG(Message) << "(propagator already sinked)" << std::endl;
-        for (unsigned int i = 0; i < result.size(); ++i)
-        {
-            Gamma gSnk(gammaList[i].first);
-            Gamma gSrc(gammaList[i].second);
-            
-            for (unsigned int t = 0; t < buf.size(); ++t)
-            {
-                result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
-            }
-        }
-    }
-    else
-    {
-        PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
-        PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
-        LatticeComplex   c(env().getGrid());
-        
-        LOG(Message) << "(using sink '" << par().sink << "')" << std::endl;
-        for (unsigned int i = 0; i < result.size(); ++i)
-        {
-            Gamma       gSnk(gammaList[i].first);
-            Gamma       gSrc(gammaList[i].second);
-            std::string ns;
-            
-            ns = env().getModuleNamespace(env().getObjectModule(par().sink));
-            if (ns == "MSource")
-            {
-                PropagatorField1 &sink =
-                    *env().template getObject<PropagatorField1>(par().sink);
-                
-                c = trace(mesonConnected(q1, q2, gSnk, gSrc)*sink);
-                sliceSum(c, buf, Tp);
-            }
-            else if (ns == "MSink")
-            {
-                SinkFnScalar &sink = *env().template getObject<SinkFnScalar>(par().sink);
-                
-                c = trace(mesonConnected(q1, q2, gSnk, gSrc));
-                buf = sink(c);
-            }
-            for (unsigned int t = 0; t < buf.size(); ++t)
-            {
-                result[i].corr[t] = TensorRemove(buf[t]);
-            }
-        }
-    }
+    XmlWriter             writer(par().output);
+    PropagatorField1      &q1 = *env().template getObject<PropagatorField1>(par().q1);
+    PropagatorField2      &q2 = *env().template getObject<PropagatorField2>(par().q2);
+    LatticeComplex        c(env().getGrid());
+    Gamma                 gSrc(par().gammaSource), gSnk(par().gammaSink);
+    Gamma                 g5(Gamma::Algebra::Gamma5);
+    std::vector<TComplex> buf;
+    Result                result;
+    
+    c = trace(gSnk*q1*adj(gSrc)*g5*adj(q2)*g5);
+    sliceSum(c, buf, Tp);
+    result.corr.resize(buf.size());
+    for (unsigned int t = 0; t < buf.size(); ++t)
+    {
+        result.corr[t] = TensorRemove(buf[t]);
+    }
     write(writer, "meson", result);
 }
@ -241,4 +134,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MContraction_Meson_hpp_
+#endif // Hadrons_Meson_hpp_
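For the Meson module kept on the left-hand side of this diff, the `mesonConnected` macro corresponds to the usual connected two-point function. As a sketch (Γ_snk and Γ_src are the two members of one GammaPair, and the spatial sum is carried out either by sliceSum or by the chosen sink module):

    C_{\Gamma_{\rm snk}\Gamma_{\rm src}}(t) = \sum_{\vec{x}} \mathrm{tr}\,[\, \gamma_5\, \Gamma_{\rm snk}\, q_1(\vec{x},t)\, \Gamma_{\rm src}^{\dagger}\, \gamma_5\, q_2^{\dagger}(\vec{x},t) \,].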
@ -1,114 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
#define Hadrons_MContraction_WeakHamiltonian_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                            WeakHamiltonian                                 *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)

/*******************************************************************************
 * Utilities for contractions involving the Weak Hamiltonian.
 ******************************************************************************/
//// Sum and store correlator.
#define MAKE_DIAG(exp, buf, res, n)\
sliceSum(exp, buf, Tp);\
res.name = (n);\
res.corr.resize(buf.size());\
for (unsigned int t = 0; t < buf.size(); ++t)\
{\
    res.corr[t] = TensorRemove(buf[t]);\
}

//// Contraction of mu index: use 'mu' variable in exp.
#define SUM_MU(buf,exp)\
buf = zero;\
for (unsigned int mu = 0; mu < ndim; ++mu)\
{\
    buf += exp;\
}

enum
{
    i_V = 0,
    i_A = 1,
    n_i = 2
};

class WeakHamiltonianPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WeakHamiltonianPar,
                                    std::string, q1,
                                    std::string, q2,
                                    std::string, q3,
                                    std::string, q4,
                                    std::string, output);
};

#define MAKE_WEAK_MODULE(modname)\
class T##modname: public Module<WeakHamiltonianPar>\
{\
public:\
    FERM_TYPE_ALIASES(FIMPL,)\
    class Result: Serializable\
    {\
    public:\
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,\
                                        std::string,          name,\
                                        std::vector<Complex>, corr);\
    };\
public:\
    /* constructor */ \
    T##modname(const std::string name);\
    /* destructor */ \
    virtual ~T##modname(void) = default;\
    /* dependency relation */ \
    virtual std::vector<std::string> getInput(void);\
    virtual std::vector<std::string> getOutput(void);\
    /* setup */ \
    virtual void setup(void);\
    /* execution */ \
    virtual void execute(void);\
    std::vector<std::string> VA_label = {"V", "A"};\
};\
MODULE_REGISTER_NS(modname, T##modname, MContraction);

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MContraction_WeakHamiltonian_hpp_
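To make the helper macros above easier to follow, here is what a typical call sequence expands to inside a module's execute() body. This is an illustrative expansion only; exprbuf, corrbuf, res, body, loop and ndim are placeholder names standing for whatever the calling module actually defines.

    // SUM_MU(exprbuf, trace(body[mu]*loop[mu]))
    // MAKE_DIAG(exprbuf, corrbuf, res, "HW_X")
    // expands, per the definitions above, to:
    exprbuf = zero;                             // SUM_MU: accumulate over the Lorentz index mu
    for (unsigned int mu = 0; mu < ndim; ++mu)
    {
        exprbuf += trace(body[mu]*loop[mu]);
    }
    sliceSum(exprbuf, corrbuf, Tp);             // MAKE_DIAG: project onto time slices
    res.name = ("HW_X");
    res.corr.resize(corrbuf.size());
    for (unsigned int t = 0; t < corrbuf.size(); ++t)
    {
        res.corr[t] = TensorRemove(corrbuf[t]); // keep the complex number on each slice
    }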
@ -1,137 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>

using namespace Grid;
using namespace Hadrons;
using namespace MContraction;

/*
 * Weak Hamiltonian current-current contractions, Eye-type.
 *
 * These contractions are generated by the Q1 and Q2 operators in the physical
 * basis (see e.g. Fig 3 of arXiv:1507.03094).
 *
 * Schematics (original ASCII diagrams): in the Saucer (S) topology the q2/q3
 * line runs from i to f through the H_W insertion, with the q4 loop attached
 * at H_W; in the Eye (E) topology the q4 loop is traced separately from the
 * q2/q3 line; q1 connects i to f in both.
 *
 * S: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1]*q4*gL[mu][p_2])
 * E: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1])*trace(q4*gL[mu][p_2])
 */

/******************************************************************************
 *                  TWeakHamiltonianEye implementation                        *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TWeakHamiltonianEye::TWeakHamiltonianEye(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakHamiltonianEye::getInput(void)
{
    std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
    
    return in;
}

std::vector<std::string> TWeakHamiltonianEye::getOutput(void)
{
    std::vector<std::string> out = {getName()};
    
    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
void TWeakHamiltonianEye::setup(void)
{
    
}

// execution ///////////////////////////////////////////////////////////////////
void TWeakHamiltonianEye::execute(void)
{
    LOG(Message) << "Computing Weak Hamiltonian (Eye type) contractions '"
                 << getName() << "' using quarks '" << par().q1 << "', '"
                 << par().q2 << ", '" << par().q3 << "' and '" << par().q4
                 << "'." << std::endl;

    CorrWriter            writer(par().output);
    PropagatorField       &q1 = *env().template getObject<PropagatorField>(par().q1);
    PropagatorField       &q2 = *env().template getObject<PropagatorField>(par().q2);
    PropagatorField       &q3 = *env().template getObject<PropagatorField>(par().q3);
    PropagatorField       &q4 = *env().template getObject<PropagatorField>(par().q4);
    Gamma                 g5  = Gamma(Gamma::Algebra::Gamma5);
    LatticeComplex        expbuf(env().getGrid());
    std::vector<TComplex> corrbuf;
    std::vector<Result>   result(n_eye_diag);
    unsigned int          ndim = env().getNd();

    PropagatorField tmp1(env().getGrid());
    LatticeComplex  tmp2(env().getGrid());
    std::vector<PropagatorField> S_body(ndim, tmp1);
    std::vector<PropagatorField> S_loop(ndim, tmp1);
    std::vector<LatticeComplex>  E_body(ndim, tmp2);
    std::vector<LatticeComplex>  E_loop(ndim, tmp2);

    // Setup for S-type contractions.
    for (int mu = 0; mu < ndim; ++mu)
    {
        S_body[mu] = MAKE_SE_BODY(q1, q2, q3, GammaL(Gamma::gmu[mu]));
        S_loop[mu] = MAKE_SE_LOOP(q4, GammaL(Gamma::gmu[mu]));
    }

    // Perform S-type contractions.
    SUM_MU(expbuf, trace(S_body[mu]*S_loop[mu]))
    MAKE_DIAG(expbuf, corrbuf, result[S_diag], "HW_S")

    // Recycle sub-expressions for E-type contractions.
    for (unsigned int mu = 0; mu < ndim; ++mu)
    {
        E_body[mu] = trace(S_body[mu]);
        E_loop[mu] = trace(S_loop[mu]);
    }

    // Perform E-type contractions.
    SUM_MU(expbuf, E_body[mu]*E_loop[mu])
    MAKE_DIAG(expbuf, corrbuf, result[E_diag], "HW_E")

    write(writer, "HW_Eye", result);
}
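Restating the Saucer and Eye traces from the header comment above as time-sliced correlators (a sketch; Γ^L_μ denotes the left-handed structure inserted via GammaL(Gamma::gmu[mu]), and the sum over μ is the one performed by SUM_MU):

    S(t) = \sum_{\vec{x},\mu} \mathrm{tr}\,[\, q_3\, \gamma_5\, q_1\, q_2^{\dagger}\, \gamma_5\, \Gamma^L_{\mu}\, q_4\, \Gamma^L_{\mu} \,],
    \qquad
    E(t) = \sum_{\vec{x},\mu} \mathrm{tr}\,[\, q_3\, \gamma_5\, q_1\, q_2^{\dagger}\, \gamma_5\, \Gamma^L_{\mu} \,]\;\mathrm{tr}\,[\, q_4\, \Gamma^L_{\mu} \,].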
@ -1,58 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_
#define Hadrons_MContraction_WeakHamiltonianEye_hpp_

#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                         WeakHamiltonianEye                                 *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)

enum
{
    S_diag = 0,
    E_diag = 1,
    n_eye_diag = 2
};

// Saucer and Eye subdiagram contractions.
#define MAKE_SE_BODY(Q_1, Q_2, Q_3, gamma) (Q_3*g5*Q_1*adj(Q_2)*g5*gamma)
#define MAKE_SE_LOOP(Q_loop, gamma) (Q_loop*gamma)

MAKE_WEAK_MODULE(WeakHamiltonianEye)

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MContraction_WeakHamiltonianEye_hpp_
@ -1,139 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>

using namespace Grid;
using namespace Hadrons;
using namespace MContraction;

/*
 * Weak Hamiltonian current-current contractions, Non-Eye-type.
 *
 * These contractions are generated by the Q1 and Q2 operators in the physical
 * basis (see e.g. Fig 3 of arXiv:1507.03094).
 *
 * Schematic (original ASCII diagrams): both topologies consist of two quark
 * loops, (q1,q2) on the source side and (q3,q4) on the sink side; in the
 * Connected (C) diagram the two loops are joined through the H_W insertion,
 * in the Wing (W) diagram they are traced separately.
 *
 * C: trace(q1*adj(q2)*g5*gL[mu]*q3*adj(q4)*g5*gL[mu])
 * W: trace(q1*adj(q2)*g5*gL[mu])*trace(q3*adj(q4)*g5*gL[mu])
 */

/******************************************************************************
 *                TWeakHamiltonianNonEye implementation                       *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TWeakHamiltonianNonEye::TWeakHamiltonianNonEye(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakHamiltonianNonEye::getInput(void)
{
    std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
    
    return in;
}

std::vector<std::string> TWeakHamiltonianNonEye::getOutput(void)
{
    std::vector<std::string> out = {getName()};
    
    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
void TWeakHamiltonianNonEye::setup(void)
{
    
}

// execution ///////////////////////////////////////////////////////////////////
void TWeakHamiltonianNonEye::execute(void)
{
    LOG(Message) << "Computing Weak Hamiltonian (Non-Eye type) contractions '"
                 << getName() << "' using quarks '" << par().q1 << "', '"
                 << par().q2 << ", '" << par().q3 << "' and '" << par().q4
                 << "'." << std::endl;

    CorrWriter            writer(par().output);
    PropagatorField       &q1 = *env().template getObject<PropagatorField>(par().q1);
    PropagatorField       &q2 = *env().template getObject<PropagatorField>(par().q2);
    PropagatorField       &q3 = *env().template getObject<PropagatorField>(par().q3);
    PropagatorField       &q4 = *env().template getObject<PropagatorField>(par().q4);
    Gamma                 g5  = Gamma(Gamma::Algebra::Gamma5);
    LatticeComplex        expbuf(env().getGrid());
    std::vector<TComplex> corrbuf;
    std::vector<Result>   result(n_noneye_diag);
    unsigned int          ndim = env().getNd();

    PropagatorField tmp1(env().getGrid());
    LatticeComplex  tmp2(env().getGrid());
    std::vector<PropagatorField> C_i_side_loop(ndim, tmp1);
    std::vector<PropagatorField> C_f_side_loop(ndim, tmp1);
    std::vector<LatticeComplex>  W_i_side_loop(ndim, tmp2);
    std::vector<LatticeComplex>  W_f_side_loop(ndim, tmp2);

    // Setup for C-type contractions.
    for (int mu = 0; mu < ndim; ++mu)
    {
        C_i_side_loop[mu] = MAKE_CW_SUBDIAG(q1, q2, GammaL(Gamma::gmu[mu]));
        C_f_side_loop[mu] = MAKE_CW_SUBDIAG(q3, q4, GammaL(Gamma::gmu[mu]));
    }

    // Perform C-type contractions.
    SUM_MU(expbuf, trace(C_i_side_loop[mu]*C_f_side_loop[mu]))
    MAKE_DIAG(expbuf, corrbuf, result[C_diag], "HW_C")

    // Recycle sub-expressions for W-type contractions.
    for (unsigned int mu = 0; mu < ndim; ++mu)
    {
        W_i_side_loop[mu] = trace(C_i_side_loop[mu]);
        W_f_side_loop[mu] = trace(C_f_side_loop[mu]);
    }

    // Perform W-type contractions.
    SUM_MU(expbuf, W_i_side_loop[mu]*W_f_side_loop[mu])
    MAKE_DIAG(expbuf, corrbuf, result[W_diag], "HW_W")

    write(writer, "HW_NonEye", result);
}
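As for the Eye-type module, the Connected and Wing traces quoted in the header comment can be written as time-sliced correlators (a sketch; Γ^L_μ is the left-handed structure inserted via GammaL(Gamma::gmu[mu])):

    C(t) = \sum_{\vec{x},\mu} \mathrm{tr}\,[\, q_1\, q_2^{\dagger}\, \gamma_5\, \Gamma^L_{\mu}\, q_3\, q_4^{\dagger}\, \gamma_5\, \Gamma^L_{\mu} \,],
    \qquad
    W(t) = \sum_{\vec{x},\mu} \mathrm{tr}\,[\, q_1\, q_2^{\dagger}\, \gamma_5\, \Gamma^L_{\mu} \,]\;\mathrm{tr}\,[\, q_3\, q_4^{\dagger}\, \gamma_5\, \Gamma^L_{\mu} \,].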
@ -1,57 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
#define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_

#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                        WeakHamiltonianNonEye                               *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)

enum
{
    W_diag = 0,
    C_diag = 1,
    n_noneye_diag = 2
};

// Wing and Connected subdiagram contractions
#define MAKE_CW_SUBDIAG(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma)

MAKE_WEAK_MODULE(WeakHamiltonianNonEye)

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
@ -1,135 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>

using namespace Grid;
using namespace Hadrons;
using namespace MContraction;

/*
 * Weak Hamiltonian + current contractions, disconnected topology for neutral
 * mesons.
 *
 * These contractions are generated by operators Q_1,...,10 of the dS=1 Weak
 * Hamiltonian in the physical basis and an additional current J (see e.g.
 * Fig 11 of arXiv:1507.03094).
 *
 * Schematic (original ASCII diagram): a (q1,q2) meson loop between i and the
 * H_W insertion, a q4 loop attached at H_W, and a q3 loop carrying the
 * external current J next to f.
 *
 * options
 * - q1: input propagator 1 (string)
 * - q2: input propagator 2 (string)
 * - q3: input propagator 3 (string), assumed to be sequential propagator
 * - q4: input propagator 4 (string), assumed to be a loop
 *
 * type 1: trace(q1*adj(q2)*g5*gL[mu])*trace(loop*gL[mu])*trace(q3*g5)
 * type 2: trace(q1*adj(q2)*g5*gL[mu]*loop*gL[mu])*trace(q3*g5)
 */

/*******************************************************************************
 *                  TWeakNeutral4ptDisc implementation                         *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TWeakNeutral4ptDisc::TWeakNeutral4ptDisc(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakNeutral4ptDisc::getInput(void)
{
    std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
    
    return in;
}

std::vector<std::string> TWeakNeutral4ptDisc::getOutput(void)
{
    std::vector<std::string> out = {getName()};
    
    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
void TWeakNeutral4ptDisc::setup(void)
{
    
}

// execution ///////////////////////////////////////////////////////////////////
void TWeakNeutral4ptDisc::execute(void)
{
    LOG(Message) << "Computing Weak Hamiltonian neutral disconnected contractions '"
                 << getName() << "' using quarks '" << par().q1 << "', '"
                 << par().q2 << ", '" << par().q3 << "' and '" << par().q4
                 << "'." << std::endl;

    CorrWriter            writer(par().output);
    PropagatorField       &q1 = *env().template getObject<PropagatorField>(par().q1);
    PropagatorField       &q2 = *env().template getObject<PropagatorField>(par().q2);
    PropagatorField       &q3 = *env().template getObject<PropagatorField>(par().q3);
    PropagatorField       &q4 = *env().template getObject<PropagatorField>(par().q4);
    Gamma                 g5  = Gamma(Gamma::Algebra::Gamma5);
    LatticeComplex        expbuf(env().getGrid());
    std::vector<TComplex> corrbuf;
    std::vector<Result>   result(n_neut_disc_diag);
    unsigned int          ndim = env().getNd();

    PropagatorField tmp(env().getGrid());
    std::vector<PropagatorField> meson(ndim, tmp);
    std::vector<PropagatorField> loop(ndim, tmp);
    LatticeComplex  curr(env().getGrid());

    // Setup for type 1 contractions.
    for (int mu = 0; mu < ndim; ++mu)
    {
        meson[mu] = MAKE_DISC_MESON(q1, q2, GammaL(Gamma::gmu[mu]));
        loop[mu]  = MAKE_DISC_LOOP(q4, GammaL(Gamma::gmu[mu]));
    }
    curr = MAKE_DISC_CURR(q3, GammaL(Gamma::Algebra::Gamma5));

    // Perform type 1 contractions.
    SUM_MU(expbuf, trace(meson[mu]*loop[mu]))
    expbuf *= curr;
    MAKE_DIAG(expbuf, corrbuf, result[neut_disc_1_diag], "HW_disc0_1")

    // Perform type 2 contractions.
    SUM_MU(expbuf, trace(meson[mu])*trace(loop[mu]))
    expbuf *= curr;
    MAKE_DIAG(expbuf, corrbuf, result[neut_disc_2_diag], "HW_disc0_2")

    write(writer, "HW_disc0", result);
}
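Following the labelling of the header comment above (and writing ℓ for the q4 loop), the two contraction types can be sketched as

    C_1(t) = \sum_{\vec{x},\mu} \mathrm{tr}\,[\, q_1\, q_2^{\dagger}\, \gamma_5\, \Gamma^L_{\mu} \,]\;\mathrm{tr}\,[\, \ell\, \Gamma^L_{\mu} \,]\;\mathrm{tr}\,[\, q_3\, \gamma_5 \,],
    \qquad
    C_2(t) = \sum_{\vec{x},\mu} \mathrm{tr}\,[\, q_1\, q_2^{\dagger}\, \gamma_5\, \Gamma^L_{\mu}\, \ell\, \Gamma^L_{\mu} \,]\;\mathrm{tr}\,[\, q_3\, \gamma_5 \,].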
@ -1,59 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
#define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_

#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                         WeakNeutral4ptDisc                                 *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)

enum
{
    neut_disc_1_diag = 0,
    neut_disc_2_diag = 1,
    n_neut_disc_diag = 2
};

// Neutral 4pt disconnected subdiagram contractions.
#define MAKE_DISC_MESON(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma)
#define MAKE_DISC_LOOP(Q_LOOP, gamma) (Q_LOOP*gamma)
#define MAKE_DISC_CURR(Q_c, gamma) (trace(Q_c*gamma))

MAKE_WEAK_MODULE(WeakNeutral4ptDisc)

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
@ -65,7 +65,7 @@ void TLoad::setup(void)
 // execution ///////////////////////////////////////////////////////////////////
 void TLoad::execute(void)
 {
-    FieldMetaData header;
+    NerscField    header;
     std::string   fileName = par().file + "."
                              + std::to_string(env().getTrajectory());
     
@ -74,5 +74,5 @@ void TLoad::execute(void)
     LatticeGaugeField &U = *env().createLattice<LatticeGaugeField>(getName());
     NerscIO::readConfiguration(U, header, fileName);
     LOG(Message) << "NERSC header:" << std::endl;
-    dump_meta_data(header, LOG(Message));
+    dump_nersc_header(header, LOG(Message));
 }
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
 
-#ifndef Hadrons_MGauge_Load_hpp_
-#define Hadrons_MGauge_Load_hpp_
+#ifndef Hadrons_Load_hpp_
+#define Hadrons_Load_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@ -70,4 +70,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MGauge_Load_hpp_
+#endif // Hadrons_Load_hpp_
@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /* END LEGAL */
 
-#ifndef Hadrons_MGauge_Random_hpp_
-#define Hadrons_MGauge_Random_hpp_
+#ifndef Hadrons_Random_hpp_
+#define Hadrons_Random_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MGauge_Random_hpp_
+#endif // Hadrons_Random_hpp_
@ -1,88 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MGauge/StochEm.cc

Copyright (C) 2015
Copyright (C) 2016

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>

using namespace Grid;
using namespace Hadrons;
using namespace MGauge;

/******************************************************************************
 *                       TStochEm implementation                              *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TStochEm::TStochEm(const std::string name)
: Module<StochEmPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TStochEm::getInput(void)
{
    std::vector<std::string> in;
    
    return in;
}

std::vector<std::string> TStochEm::getOutput(void)
{
    std::vector<std::string> out = {getName()};
    
    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
void TStochEm::setup(void)
{
    if (!env().hasRegisteredObject("_" + getName() + "_weight"))
    {
        env().registerLattice<EmComp>("_" + getName() + "_weight");
    }
    env().registerLattice<EmField>(getName());
}

// execution ///////////////////////////////////////////////////////////////////
void TStochEm::execute(void)
{
    PhotonR photon(par().gauge, par().zmScheme);
    EmField &a = *env().createLattice<EmField>(getName());
    EmComp  *w;
    
    if (!env().hasCreatedObject("_" + getName() + "_weight"))
    {
        LOG(Message) << "Caching stochatic EM potential weight (gauge: "
                     << par().gauge << ", zero-mode scheme: "
                     << par().zmScheme << ")..." << std::endl;
        w = env().createLattice<EmComp>("_" + getName() + "_weight");
        photon.StochasticWeight(*w);
    }
    else
    {
        w = env().getObject<EmComp>("_" + getName() + "_weight");
    }
    LOG(Message) << "Generating stochatic EM potential..." << std::endl;
    photon.StochasticField(a, *env().get4dRng(), *w);
}
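The setup/execute pair above implements a simple compute-once cache for the stochastic weight, keyed on a hidden object name derived from the module name. A condensed sketch of that pattern, as a restatement for clarity only (weightName is just a local alias introduced here, not a name from the source):

    std::string weightName = "_" + getName() + "_weight"; // hidden cache entry, one per module instance
    
    if (!env().hasCreatedObject(weightName))               // first trajectory: compute and store
    {
        w = env().createLattice<EmComp>(weightName);
        photon.StochasticWeight(*w);
    }
    else                                                   // later trajectories: reuse the cached weight
    {
        w = env().getObject<EmComp>(weightName);
    }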
@@ -1,75 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp

Copyright (C) 2015
Copyright (C) 2016


*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MGauge_StochEm_hpp_
#define Hadrons_MGauge_StochEm_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                                 StochEm                                    *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MGauge)

class StochEmPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
                                    PhotonR::Gauge,    gauge,
                                    PhotonR::ZmScheme, zmScheme);
};

class TStochEm: public Module<StochEmPar>
{
public:
    typedef PhotonR::GaugeField     EmField;
    typedef PhotonR::GaugeLinkField EmComp;
public:
    // constructor
    TStochEm(const std::string name);
    // destructor
    virtual ~TStochEm(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
};

MODULE_REGISTER_NS(StochEm, TStochEm, MGauge);

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MGauge_StochEm_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /* END LEGAL */
 
-#ifndef Hadrons_MGauge_Unit_hpp_
-#define Hadrons_MGauge_Unit_hpp_
+#ifndef Hadrons_Unit_hpp_
+#define Hadrons_Unit_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MGauge_Unit_hpp_
+#endif // Hadrons_Unit_hpp_
@@ -1,132 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MLoop/NoiseLoop.hpp

Copyright (C) 2016

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

*************************************************************************************/
/* END LEGAL */

#ifndef Hadrons_MLoop_NoiseLoop_hpp_
#define Hadrons_MLoop_NoiseLoop_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/*

 Noise loop propagator
 -----------------------------
 * loop_x = q_x * adj(eta_x)

 * options:
 - q = Result of inversion on noise source.
 - eta = noise source.

 */


/******************************************************************************
 *                               NoiseLoop                                    *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MLoop)

class NoiseLoopPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(NoiseLoopPar,
                                    std::string, q,
                                    std::string, eta);
};

template <typename FImpl>
class TNoiseLoop: public Module<NoiseLoopPar>
{
public:
    FERM_TYPE_ALIASES(FImpl,);
public:
    // constructor
    TNoiseLoop(const std::string name);
    // destructor
    virtual ~TNoiseLoop(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
};

MODULE_REGISTER_NS(NoiseLoop, TNoiseLoop<FIMPL>, MLoop);

/******************************************************************************
 *                        TNoiseLoop implementation                           *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TNoiseLoop<FImpl>::TNoiseLoop(const std::string name)
: Module<NoiseLoopPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TNoiseLoop<FImpl>::getInput(void)
{
    std::vector<std::string> in = {par().q, par().eta};

    return in;
}

template <typename FImpl>
std::vector<std::string> TNoiseLoop<FImpl>::getOutput(void)
{
    std::vector<std::string> out = {getName()};

    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TNoiseLoop<FImpl>::setup(void)
{
    env().template registerLattice<PropagatorField>(getName());
}

// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TNoiseLoop<FImpl>::execute(void)
{
    PropagatorField &loop = *env().template createLattice<PropagatorField>(getName());
    PropagatorField &q    = *env().template getObject<PropagatorField>(par().q);
    PropagatorField &eta  = *env().template getObject<PropagatorField>(par().eta);
    loop = q*adj(eta);
}

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MLoop_NoiseLoop_hpp_
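A one-line justification of the `loop = q*adj(eta)` estimator in the file above (assuming the noise eta is normalised so that its outer product averages to the identity on all indices):

    \langle \eta(y)\,\eta(x)^\dagger \rangle = \delta_{xy}\,\mathbb{1}
    \;\Longrightarrow\;
    \langle q(x)\,\eta(x)^\dagger \rangle = \sum_y S(x,y)\,\langle \eta(y)\,\eta(x)^\dagger \rangle = S(x,x),

so averaging over noise hits gives an unbiased estimate of the disconnected loop S(x,x).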
@@ -1,226 +0,0 @@
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>

using namespace Grid;
using namespace Hadrons;
using namespace MScalar;

/******************************************************************************
 *                     TChargedProp implementation                            *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TChargedProp::TChargedProp(const std::string name)
: Module<ChargedPropPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TChargedProp::getInput(void)
{
    std::vector<std::string> in = {par().source, par().emField};

    return in;
}

std::vector<std::string> TChargedProp::getOutput(void)
{
    std::vector<std::string> out = {getName()};

    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
void TChargedProp::setup(void)
{
    freeMomPropName_ = FREEMOMPROP(par().mass);
    phaseName_.clear();
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
    }
    GFSrcName_ = "_" + getName() + "_DinvSrc";
    if (!env().hasRegisteredObject(freeMomPropName_))
    {
        env().registerLattice<ScalarField>(freeMomPropName_);
    }
    if (!env().hasRegisteredObject(phaseName_[0]))
    {
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            env().registerLattice<ScalarField>(phaseName_[mu]);
        }
    }
    if (!env().hasRegisteredObject(GFSrcName_))
    {
        env().registerLattice<ScalarField>(GFSrcName_);
    }
    env().registerLattice<ScalarField>(getName());
}

// execution ///////////////////////////////////////////////////////////////////
void TChargedProp::execute(void)
{
    // CACHING ANALYTIC EXPRESSIONS
    ScalarField &source = *env().getObject<ScalarField>(par().source);
    Complex     ci(0.0,1.0);
    FFT         fft(env().getGrid());

    // cache free scalar propagator
    if (!env().hasCreatedObject(freeMomPropName_))
    {
        LOG(Message) << "Caching momentum space free scalar propagator"
                     << " (mass= " << par().mass << ")..." << std::endl;
        freeMomProp_ = env().createLattice<ScalarField>(freeMomPropName_);
        SIMPL::MomentumSpacePropagator(*freeMomProp_, par().mass);
    }
    else
    {
        freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_);
    }
    // cache G*F*src
    if (!env().hasCreatedObject(GFSrcName_))

    {
        GFSrc_ = env().createLattice<ScalarField>(GFSrcName_);
        fft.FFT_all_dim(*GFSrc_, source, FFT::forward);
        *GFSrc_ = (*freeMomProp_)*(*GFSrc_);
    }
    else
    {
        GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
    }
    // cache phases
    if (!env().hasCreatedObject(phaseName_[0]))
    {
        std::vector<int> &l = env().getGrid()->_fdimensions;

        LOG(Message) << "Caching shift phases..." << std::endl;
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            Real twoPiL = M_PI*2./l[mu];

            phase_.push_back(env().createLattice<ScalarField>(phaseName_[mu]));
            LatticeCoordinate(*(phase_[mu]), mu);
            *(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu])));
        }
    }
    else
    {
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            phase_.push_back(env().getObject<ScalarField>(phaseName_[mu]));
        }
    }

    // PROPAGATOR CALCULATION
    LOG(Message) << "Computing charged scalar propagator"
                 << " (mass= " << par().mass
                 << ", charge= " << par().charge << ")..." << std::endl;

    ScalarField &prop = *env().createLattice<ScalarField>(getName());
    ScalarField buf(env().getGrid());
    ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_;
    double      q = par().charge;

    // G*F*Src
    prop = GFSrc;

    // - q*G*momD1*G*F*Src (momD1 = F*D1*Finv)
    buf = GFSrc;
    momD1(buf, fft);
    buf = G*buf;
    prop = prop - q*buf;

    // + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
    momD1(buf, fft);
    prop = prop + q*q*G*buf;

    // - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv)
    buf = GFSrc;
    momD2(buf, fft);
    prop = prop - q*q*G*buf;

    // final FT
    fft.FFT_all_dim(prop, prop, FFT::backward);

    // OUTPUT IF NECESSARY
    if (!par().output.empty())
    {
        std::string filename = par().output + "." +
                               std::to_string(env().getTrajectory());

        LOG(Message) << "Saving zero-momentum projection to '"
                     << filename << "'..." << std::endl;

        CorrWriter            writer(filename);
        std::vector<TComplex> vecBuf;
        std::vector<Complex>  result;

        sliceSum(prop, vecBuf, Tp);
        result.resize(vecBuf.size());
        for (unsigned int t = 0; t < vecBuf.size(); ++t)
        {
            result[t] = TensorRemove(vecBuf[t]);
        }
        write(writer, "charge", q);
        write(writer, "prop", result);
    }
}

void TChargedProp::momD1(ScalarField &s, FFT &fft)
{
    EmField     &A = *env().getObject<EmField>(par().emField);
    ScalarField buf(env().getGrid()), result(env().getGrid()),
                Amu(env().getGrid());
    Complex     ci(0.0,1.0);

    result = zero;

    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Amu = peekLorentz(A, mu);
        buf = (*phase_[mu])*s;
        fft.FFT_all_dim(buf, buf, FFT::backward);
        buf = Amu*buf;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result - ci*buf;
    }
    fft.FFT_all_dim(s, s, FFT::backward);
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Amu = peekLorentz(A, mu);
        buf = Amu*s;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result + ci*adj(*phase_[mu])*buf;
    }

    s = result;
}

void TChargedProp::momD2(ScalarField &s, FFT &fft)
{
    EmField     &A = *env().getObject<EmField>(par().emField);
    ScalarField buf(env().getGrid()), result(env().getGrid()),
                Amu(env().getGrid());

    result = zero;

    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Amu = peekLorentz(A, mu);
        buf = (*phase_[mu])*s;
        fft.FFT_all_dim(buf, buf, FFT::backward);
        buf = Amu*Amu*buf;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result + .5*buf;
    }
    fft.FFT_all_dim(s, s, FFT::backward);
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Amu = peekLorentz(A, mu);
        buf = Amu*Amu*s;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result + .5*adj(*phase_[mu])*buf;
    }

    s = result;
}
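Collecting the steps commented in `TChargedProp::execute` above, the module evaluates the charged scalar propagator to second order in the charge q, entirely in momentum space, before the final inverse FFT:

    \tilde G_q\,\tilde\rho \;=\; \Big[\, G \;-\; q\,G D_1 G \;+\; q^2\big( G D_1 G D_1 G \;-\; G D_2 G \big) \Big]\,\tilde\rho \;+\; O(q^3),

where G is the free momentum-space scalar propagator, \tilde\rho the Fourier-transformed source, and D_1, D_2 the one- and two-photon insertions implemented by momD1 and momD2.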
@@ -1,61 +0,0 @@
#ifndef Hadrons_MScalar_ChargedProp_hpp_
#define Hadrons_MScalar_ChargedProp_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                      Charged scalar propagator                             *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalar)

class ChargedPropPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar,
                                    std::string, emField,
                                    std::string, source,
                                    double,      mass,
                                    double,      charge,
                                    std::string, output);
};

class TChargedProp: public Module<ChargedPropPar>
{
public:
    SCALAR_TYPE_ALIASES(SIMPL,);
    typedef PhotonR::GaugeField     EmField;
    typedef PhotonR::GaugeLinkField EmComp;
public:
    // constructor
    TChargedProp(const std::string name);
    // destructor
    virtual ~TChargedProp(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
private:
    void momD1(ScalarField &s, FFT &fft);
    void momD2(ScalarField &s, FFT &fft);
private:
    std::string                freeMomPropName_, GFSrcName_;
    std::vector<std::string>   phaseName_;
    ScalarField                *freeMomProp_, *GFSrc_;
    std::vector<ScalarField *> phase_;
    EmField                    *A;
};

MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MScalar_ChargedProp_hpp_
@@ -1,79 +0,0 @@
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>

using namespace Grid;
using namespace Hadrons;
using namespace MScalar;

/******************************************************************************
 *                       TFreeProp implementation                             *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TFreeProp::TFreeProp(const std::string name)
: Module<FreePropPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TFreeProp::getInput(void)
{
    std::vector<std::string> in = {par().source};

    return in;
}

std::vector<std::string> TFreeProp::getOutput(void)
{
    std::vector<std::string> out = {getName()};

    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
void TFreeProp::setup(void)
{
    freeMomPropName_ = FREEMOMPROP(par().mass);

    if (!env().hasRegisteredObject(freeMomPropName_))
    {
        env().registerLattice<ScalarField>(freeMomPropName_);
    }
    env().registerLattice<ScalarField>(getName());
}

// execution ///////////////////////////////////////////////////////////////////
void TFreeProp::execute(void)
{
    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
    ScalarField &source = *env().getObject<ScalarField>(par().source);
    ScalarField *freeMomProp;

    if (!env().hasCreatedObject(freeMomPropName_))
    {
        LOG(Message) << "Caching momentum space free scalar propagator"
                     << " (mass= " << par().mass << ")..." << std::endl;
        freeMomProp = env().createLattice<ScalarField>(freeMomPropName_);
        SIMPL::MomentumSpacePropagator(*freeMomProp, par().mass);
    }
    else
    {
        freeMomProp = env().getObject<ScalarField>(freeMomPropName_);
    }
    LOG(Message) << "Computing free scalar propagator..." << std::endl;
    SIMPL::FreePropagator(source, prop, *freeMomProp);

    if (!par().output.empty())
    {
        TextWriter            writer(par().output + "." +
                                     std::to_string(env().getTrajectory()));
        std::vector<TComplex> buf;
        std::vector<Complex>  result;

        sliceSum(prop, buf, Tp);
        result.resize(buf.size());
        for (unsigned int t = 0; t < buf.size(); ++t)
        {
            result[t] = TensorRemove(buf[t]);
        }
        write(writer, "prop", result);
    }
}
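For reference, the free scalar propagator applied above is diagonal in momentum space; a standard lattice form (an assumption about the exact convention used by SIMPL::MomentumSpacePropagator) is

    \tilde G(k) = \frac{1}{\hat k^2 + m^2}, \qquad \hat k^2 = \sum_\mu 4\sin^2\!\frac{k_\mu}{2},

so FreePropagator amounts to a forward FFT of the source, multiplication by \tilde G, and an inverse FFT.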
@@ -1,50 +0,0 @@
#ifndef Hadrons_MScalar_FreeProp_hpp_
#define Hadrons_MScalar_FreeProp_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                               FreeProp                                     *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalar)

class FreePropPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
                                    std::string, source,
                                    double,      mass,
                                    std::string, output);
};

class TFreeProp: public Module<FreePropPar>
{
public:
    SCALAR_TYPE_ALIASES(SIMPL,);
public:
    // constructor
    TFreeProp(const std::string name);
    // destructor
    virtual ~TFreeProp(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
private:
    std::string freeMomPropName_;
};

MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar);

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MScalar_FreeProp_hpp_
@@ -1,6 +0,0 @@
#ifndef Hadrons_Scalar_hpp_
#define Hadrons_Scalar_hpp_

#define FREEMOMPROP(m) "_scalar_mom_prop_" + std::to_string(m)

#endif // Hadrons_Scalar_hpp_
@@ -1,114 +0,0 @@
#ifndef Hadrons_MSink_Point_hpp_
#define Hadrons_MSink_Point_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                                  Point                                     *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MSink)

class PointPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(PointPar,
                                    std::string, mom);
};

template <typename FImpl>
class TPoint: public Module<PointPar>
{
public:
    FERM_TYPE_ALIASES(FImpl,);
    SINK_TYPE_ALIASES();
public:
    // constructor
    TPoint(const std::string name);
    // destructor
    virtual ~TPoint(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
};

MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSink);
MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSink);

/******************************************************************************
 *                          TPoint implementation                             *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TPoint<FImpl>::TPoint(const std::string name)
: Module<PointPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TPoint<FImpl>::getInput(void)
{
    std::vector<std::string> in;

    return in;
}

template <typename FImpl>
std::vector<std::string> TPoint<FImpl>::getOutput(void)
{
    std::vector<std::string> out = {getName()};

    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TPoint<FImpl>::setup(void)
{
    unsigned int size;

    size = env().template lattice4dSize<LatticeComplex>();
    env().registerObject(getName(), size);
}

// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TPoint<FImpl>::execute(void)
{
    std::vector<Real> p = strToVec<Real>(par().mom);
    LatticeComplex    ph(env().getGrid()), coor(env().getGrid());
    Complex           i(0.0,1.0);

    LOG(Message) << "Setting up point sink function for momentum ["
                 << par().mom << "]" << std::endl;
    ph = zero;
    for(unsigned int mu = 0; mu < env().getNd(); mu++)
    {
        LatticeCoordinate(coor, mu);
        ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
    }
    ph = exp((Real)(2*M_PI)*i*ph);
    auto sink = [ph](const PropagatorField &field)
    {
        SlicedPropagator res;
        PropagatorField  tmp = ph*field;

        sliceSum(tmp, res, Tp);

        return res;
    };
    env().setObject(getName(), new SinkFn(sink));
}

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MSink_Point_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /* END LEGAL */
 
-#ifndef Hadrons_MSolver_RBPrecCG_hpp_
-#define Hadrons_MSolver_RBPrecCG_hpp_
+#ifndef Hadrons_RBPrecCG_hpp_
+#define Hadrons_RBPrecCG_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -53,7 +53,7 @@ template <typename FImpl>
 class TRBPrecCG: public Module<RBPrecCGPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TRBPrecCG(const std::string name);
@@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MSolver_RBPrecCG_hpp_
+#endif // Hadrons_RBPrecCG_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /* END LEGAL */
 
-#ifndef Hadrons_MSource_Point_hpp_
-#define Hadrons_MSource_Point_hpp_
+#ifndef Hadrons_Point_hpp_
+#define Hadrons_Point_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -63,7 +63,7 @@ template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TPoint(const std::string name);
@@ -78,8 +78,7 @@ public:
     virtual void execute(void);
 };
 
 MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSource);
-MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSource);
 
 /******************************************************************************
  *                       TPoint template implementation                       *
@@ -133,4 +132,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MSource_Point_hpp_
+#endif // Hadrons_Point_hpp_
@@ -6,7 +6,6 @@ Source file: extras/Hadrons/Modules/MSource/SeqGamma.hpp
 
 Copyright (C) 2015
 Copyright (C) 2016
-Copyright (C) 2017
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -28,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /* END LEGAL */
 
-#ifndef Hadrons_MSource_SeqGamma_hpp_
-#define Hadrons_MSource_SeqGamma_hpp_
+#ifndef Hadrons_SeqGamma_hpp_
+#define Hadrons_SeqGamma_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -72,7 +71,7 @@ template <typename FImpl>
 class TSeqGamma: public Module<SeqGammaPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TSeqGamma(const std::string name);
@@ -150,9 +149,9 @@ void TSeqGamma<FImpl>::execute(void)
     for(unsigned int mu = 0; mu < env().getNd(); mu++)
     {
         LatticeCoordinate(coor, mu);
-        ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
+        ph = ph + p[mu]*coor;
     }
-    ph = exp((Real)(2*M_PI)*i*ph);
+    ph = exp(i*ph);
     LatticeCoordinate(t, Tp);
     src = where((t >= par().tA) and (t <= par().tB), ph*(g*q), 0.*q);
 }
@@ -161,4 +160,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MSource_SeqGamma_hpp_
+#endif // Hadrons_SeqGamma_hpp_
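The two sides of the last code hunk above differ only in the momentum phase normalisation. On a finite lattice of extent L_mu the allowed momenta are quantised, so the phase needs both the 2π and the 1/L_mu factors,

    p_\mu = \frac{2\pi n_\mu}{L_\mu}, \qquad
    e^{\,i p\cdot x} = \exp\Big( 2\pi i \sum_\mu \frac{n_\mu x_\mu}{L_\mu} \Big),

which is what the variant using (Real)(2*M_PI) and _fdimensions[mu] implements.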
@@ -1,147 +0,0 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/MSource/Wall.hpp

Copyright (C) 2017

Author: Andrew Lawson <andrew.lawson1991@gmail.com>

*************************************************************************************/
/* END LEGAL */

#ifndef Hadrons_MSource_WallSource_hpp_
#define Hadrons_MSource_WallSource_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/*

 Wall source
 -----------------------------
 * src_x = delta(x_3 - tW) * exp(i x.mom)

 * options:
 - tW: source timeslice (integer)
 - mom: momentum insertion, space-separated float sequence (e.g ".1 .2 1. 0.")

 */

/******************************************************************************
 *                                  Wall                                      *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MSource)

class WallPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WallPar,
                                    unsigned int, tW,
                                    std::string,  mom);
};

template <typename FImpl>
class TWall: public Module<WallPar>
{
public:
    FERM_TYPE_ALIASES(FImpl,);
public:
    // constructor
    TWall(const std::string name);
    // destructor
    virtual ~TWall(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
};

MODULE_REGISTER_NS(Wall, TWall<FIMPL>, MSource);

/******************************************************************************
 *                          TWall implementation                              *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TWall<FImpl>::TWall(const std::string name)
: Module<WallPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TWall<FImpl>::getInput(void)
{
    std::vector<std::string> in;

    return in;
}

template <typename FImpl>
std::vector<std::string> TWall<FImpl>::getOutput(void)
{
    std::vector<std::string> out = {getName()};

    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWall<FImpl>::setup(void)
{
    env().template registerLattice<PropagatorField>(getName());
}

// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWall<FImpl>::execute(void)
{
    LOG(Message) << "Generating wall source at t = " << par().tW
                 << " with momentum " << par().mom << std::endl;

    PropagatorField &src = *env().template createLattice<PropagatorField>(getName());
    Lattice<iScalar<vInteger>> t(env().getGrid());
    LatticeComplex             ph(env().getGrid()), coor(env().getGrid());
    std::vector<Real>          p;
    Complex                    i(0.0,1.0);

    p  = strToVec<Real>(par().mom);
    ph = zero;
    for(unsigned int mu = 0; mu < Nd; mu++)
    {
        LatticeCoordinate(coor, mu);
        ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
    }
    ph = exp((Real)(2*M_PI)*i*ph);
    LatticeCoordinate(t, Tp);
    src = 1.;
    src = where((t == par().tW), src*ph, 0.*src);
}

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MSource_WallSource_hpp_
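In the notation of the comment block in Wall.hpp above, the source built by TWall::execute is

    \eta(\vec x, t) = \delta_{t,\,t_W}\, e^{\,i \vec p\cdot\vec x}, \qquad p_i = \frac{2\pi n_i}{L_i},

so contracting a propagator with it projects the source end onto spatial momentum \vec p on the single timeslice t_W.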
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /* END LEGAL */
 
-#ifndef Hadrons_MSource_Z2_hpp_
-#define Hadrons_MSource_Z2_hpp_
+#ifndef Hadrons_Z2_hpp_
+#define Hadrons_Z2_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -67,7 +67,7 @@ template <typename FImpl>
 class TZ2: public Module<Z2Par>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TZ2(const std::string name);
@@ -82,8 +82,7 @@ public:
     virtual void execute(void);
 };
 
 MODULE_REGISTER_NS(Z2, TZ2<FIMPL>, MSource);
-MODULE_REGISTER_NS(ScalarZ2, TZ2<ScalarImplCR>, MSource);
 
 /******************************************************************************
  *                        TZ2 template implementation                         *
@@ -149,4 +148,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MSource_Z2_hpp_
+#endif // Hadrons_Z2_hpp_
@@ -1,5 +1,34 @@
-#ifndef Hadrons_MFermion_GaugeProp_hpp_
-#define Hadrons_MFermion_GaugeProp_hpp_
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: extras/Hadrons/Modules/Quark.hpp
+
+Copyright (C) 2015
+Copyright (C) 2016
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+*************************************************************************************/
+/* END LEGAL */
+
+#ifndef Hadrons_Quark_hpp_
+#define Hadrons_Quark_hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -8,29 +37,27 @@
 BEGIN_HADRONS_NAMESPACE
 
 /******************************************************************************
- *                               GaugeProp                                    *
+ *                                 TQuark                                     *
  ******************************************************************************/
-BEGIN_MODULE_NAMESPACE(MFermion)
-
-class GaugePropPar: Serializable
+class QuarkPar: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar,
+    GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar,
                                     std::string, source,
                                     std::string, solver);
 };
 
 template <typename FImpl>
-class TGaugeProp: public Module<GaugePropPar>
+class TQuark: public Module<QuarkPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    TYPE_ALIASES(FImpl,);
 public:
     // constructor
-    TGaugeProp(const std::string name);
+    TQuark(const std::string name);
     // destructor
-    virtual ~TGaugeProp(void) = default;
-    // dependency relation
+    virtual ~TQuark(void) = default;
+    // dependencies/products
     virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
     // setup
@@ -42,20 +69,20 @@ private:
     SolverFn *solver_{nullptr};
 };
 
-MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);
+MODULE_REGISTER(Quark, TQuark<FIMPL>);
 
 /******************************************************************************
- *                        TGaugeProp implementation                           *
+ *                          TQuark implementation                             *
  ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
-TGaugeProp<FImpl>::TGaugeProp(const std::string name)
-: Module<GaugePropPar>(name)
+TQuark<FImpl>::TQuark(const std::string name)
+: Module(name)
 {}
 
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl>
-std::vector<std::string> TGaugeProp<FImpl>::getInput(void)
+std::vector<std::string> TQuark<FImpl>::getInput(void)
 {
     std::vector<std::string> in = {par().source, par().solver};
 
@@ -63,7 +90,7 @@ std::vector<std::string> TGaugeProp<FImpl>::getInput(void)
 }
 
 template <typename FImpl>
-std::vector<std::string> TGaugeProp<FImpl>::getOutput(void)
+std::vector<std::string> TQuark<FImpl>::getOutput(void)
 {
     std::vector<std::string> out = {getName(), getName() + "_5d"};
 
@@ -72,7 +99,7 @@ std::vector<std::string> TGaugeProp<FImpl>::getOutput(void)
 
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TGaugeProp<FImpl>::setup(void)
+void TQuark<FImpl>::setup(void)
 {
     Ls_ = env().getObjectLs(par().solver);
     env().template registerLattice<PropagatorField>(getName());
@@ -84,13 +111,13 @@ void TGaugeProp<FImpl>::setup(void)
 
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TGaugeProp<FImpl>::execute(void)
+void TQuark<FImpl>::execute(void)
 {
     LOG(Message) << "Computing quark propagator '" << getName() << "'"
                  << std::endl;
 
     FermionField    source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)),
                     tmp(env().getGrid());
     std::string     propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
     PropagatorField &prop    = *env().template createLattice<PropagatorField>(propName);
     PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);
@@ -101,7 +128,7 @@ void TGaugeProp<FImpl>::execute(void)
     }
 
     LOG(Message) << "Inverting using solver '" << par().solver
                 << "' on source '" << par().source << "'" << std::endl;
     for (unsigned int s = 0; s < Ns; ++s)
     for (unsigned int c = 0; c < Nc; ++c)
     {
@@ -143,18 +170,16 @@ void TGaugeProp<FImpl>::execute(void)
         if (Ls_ > 1)
         {
             PropagatorField &p4d =
                 *env().template getObject<PropagatorField>(getName());
 
             axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
-            axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
+            axpby_ssp_pplus(sol, 0., sol, 1., sol, 0, Ls_-1);
             ExtractSlice(tmp, sol, 0, 0);
             FermToProp(p4d, tmp, s, c);
         }
     }
 }
 
-END_MODULE_NAMESPACE
-
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MFermion_GaugeProp_hpp_
+#endif // Hadrons_Quark_hpp_
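A note on the only non-trivial code difference in the hunks above (the coefficient passed to axpby_ssp_pplus): for a five-dimensional (domain-wall type) solve, the physical four-dimensional propagator is conventionally assembled from the chiral projections of the two boundary slices of the 5d solution,

    q(x) = P_-\,\psi(x, 0) + P_+\,\psi(x, L_s-1), \qquad P_\pm = \tfrac{1}{2}(1 \pm \gamma_5),

so the two branches differ in whether the already-projected s = 0 contribution is kept (coefficient 1.) or overwritten (coefficient 0.) when the P_+ piece from s = L_s - 1 is added to slice 0.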
@@ -1,5 +1,5 @@
-#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
-#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
+#ifndef Hadrons____FILEBASENAME____hpp_
+#define Hadrons____FILEBASENAME____hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -41,4 +41,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
+#endif // Hadrons____FILEBASENAME____hpp_
@@ -1,5 +1,5 @@
-#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
-#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
+#ifndef Hadrons____FILEBASENAME____hpp_
+#define Hadrons____FILEBASENAME____hpp_
 
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -82,4 +82,4 @@ END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
+#endif // Hadrons____FILEBASENAME____hpp_
@@ -1,38 +1,19 @@
 modules_cc =\
-  Modules/MContraction/WeakHamiltonianEye.cc \
-  Modules/MContraction/WeakHamiltonianNonEye.cc \
-  Modules/MContraction/WeakNeutral4ptDisc.cc \
   Modules/MGauge/Load.cc \
   Modules/MGauge/Random.cc \
-  Modules/MGauge/StochEm.cc \
-  Modules/MGauge/Unit.cc \
-  Modules/MScalar/ChargedProp.cc \
-  Modules/MScalar/FreeProp.cc
+  Modules/MGauge/Unit.cc
 
 modules_hpp =\
   Modules/MAction/DWF.hpp \
   Modules/MAction/Wilson.hpp \
   Modules/MContraction/Baryon.hpp \
-  Modules/MContraction/DiscLoop.hpp \
-  Modules/MContraction/Gamma3pt.hpp \
   Modules/MContraction/Meson.hpp \
-  Modules/MContraction/WeakHamiltonian.hpp \
-  Modules/MContraction/WeakHamiltonianEye.hpp \
-  Modules/MContraction/WeakHamiltonianNonEye.hpp \
-  Modules/MContraction/WeakNeutral4ptDisc.hpp \
-  Modules/MFermion/GaugeProp.hpp \
  Modules/MGauge/Load.hpp \
   Modules/MGauge/Random.hpp \
-  Modules/MGauge/StochEm.hpp \
   Modules/MGauge/Unit.hpp \
-  Modules/MLoop/NoiseLoop.hpp \
-  Modules/MScalar/ChargedProp.hpp \
-  Modules/MScalar/FreeProp.hpp \
-  Modules/MScalar/Scalar.hpp \
-  Modules/MSink/Point.hpp \
  Modules/MSolver/RBPrecCG.hpp \
   Modules/MSource/Point.hpp \
   Modules/MSource/SeqGamma.hpp \
-  Modules/MSource/Wall.hpp \
-  Modules/MSource/Z2.hpp
+  Modules/MSource/Z2.hpp \
+  Modules/Quark.hpp
 
@@ -1,11 +0,0 @@
#include <qed-fvol/Global.hpp>

using namespace Grid;
using namespace QCD;
using namespace QedFVol;

QedFVolLogger QedFVol::QedFVolLogError(1,"Error");
QedFVolLogger QedFVol::QedFVolLogWarning(1,"Warning");
QedFVolLogger QedFVol::QedFVolLogMessage(1,"Message");
QedFVolLogger QedFVol::QedFVolLogIterative(1,"Iterative");
QedFVolLogger QedFVol::QedFVolLogDebug(1,"Debug");
@@ -1,42 +0,0 @@
#ifndef QedFVol_Global_hpp_
#define QedFVol_Global_hpp_

#include <Grid/Grid.h>

#define BEGIN_QEDFVOL_NAMESPACE \
namespace Grid {\
using namespace QCD;\
namespace QedFVol {\
using Grid::operator<<;
#define END_QEDFVOL_NAMESPACE }}

/* the 'using Grid::operator<<;' statement prevents a very nasty compilation
 * error with GCC (clang compiles fine without it).
 */

BEGIN_QEDFVOL_NAMESPACE

class QedFVolLogger: public Logger
{
public:
    QedFVolLogger(int on, std::string nm): Logger("QedFVol", on, nm,
                                                  GridLogColours, "BLACK"){};
};

#define LOG(channel) std::cout << QedFVolLog##channel
#define QEDFVOL_ERROR(msg)\
LOG(Error) << msg << " (" << __FUNCTION__ << " at " << __FILE__ << ":"\
           << __LINE__ << ")" << std::endl;\
abort();

#define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;

extern QedFVolLogger QedFVolLogError;
extern QedFVolLogger QedFVolLogWarning;
extern QedFVolLogger QedFVolLogMessage;
extern QedFVolLogger QedFVolLogIterative;
extern QedFVolLogger QedFVolLogDebug;

END_QEDFVOL_NAMESPACE

#endif // QedFVol_Global_hpp_
@@ -1,9 +0,0 @@
AM_CXXFLAGS += -I$(top_srcdir)/extras

bin_PROGRAMS = qed-fvol

qed_fvol_SOURCES = \
    qed-fvol.cc \
    Global.cc

qed_fvol_LDADD = -lGrid
@ -1,265 +0,0 @@
#ifndef QEDFVOL_WILSONLOOPS_H
#define QEDFVOL_WILSONLOOPS_H

#include <Global.hpp>

BEGIN_QEDFVOL_NAMESPACE

template <class Gimpl> class NewWilsonLoops : public Gimpl {
public:
  INHERIT_GIMPL_TYPES(Gimpl);

  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;

  //////////////////////////////////////////////////
  // directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
                           const int mu, const int nu) {
    // Annoyingly, must use either scope resolution to find dependent base
    // class, or this-> ; there is no "this" in a static method. This forces
    // explicit Gimpl scope resolution throughout the usage in this file, and
    // rather defeats the purpose of deriving from Gimpl.
    plaq = Gimpl::CovShiftBackward(
        U[mu], mu, Gimpl::CovShiftBackward(
                       U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
  }
  //////////////////////////////////////////////////
  // trace of directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void traceDirPlaquette(LatticeComplex &plaq,
                                const std::vector<GaugeMat> &U, const int mu,
                                const int nu) {
    GaugeMat sp(U[0]._grid);
    dirPlaquette(sp, U, mu, nu);
    plaq = trace(sp);
  }
  //////////////////////////////////////////////////
  // sum over all planes of plaquette
  //////////////////////////////////////////////////
  static void sitePlaquette(LatticeComplex &Plaq,
                            const std::vector<GaugeMat> &U) {
    LatticeComplex sitePlaq(U[0]._grid);
    Plaq = zero;
    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceDirPlaquette(sitePlaq, U, mu, nu);
        Plaq = Plaq + sitePlaq;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static Real sumPlaquette(const GaugeLorentz &Umu) {
    std::vector<GaugeMat> U(4, Umu._grid);

    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

    LatticeComplex Plaq(Umu._grid);

    sitePlaquette(Plaq, U);

    TComplex Tp = sum(Plaq);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static Real avgPlaquette(const GaugeLorentz &Umu) {
    int ndim = Umu._grid->_ndimension;
    Real sumplaq = sumPlaquette(Umu);
    Real vol = Umu._grid->gSites();
    Real faces = (1.0 * ndim * (ndim - 1)) / 2.0;
    return sumplaq / vol / faces / Nc; // Nc dependent... FIXME
  }

  //////////////////////////////////////////////////
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
                         const int Rmu, const int Rnu,
                         const int mu, const int nu) {
    wl = U[nu];

    for(int i = 0; i < Rnu-1; i++){
      wl = Gimpl::CovShiftForward(U[nu], nu, wl);
    }

    for(int i = 0; i < Rmu; i++){
      wl = Gimpl::CovShiftForward(U[mu], mu, wl);
    }

    for(int i = 0; i < Rnu; i++){
      wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
    }

    for(int i = 0; i < Rmu; i++){
      wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
    }
  }
  //////////////////////////////////////////////////
  // trace of Wilson Loop oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void traceWilsonLoop(LatticeComplex &wl,
                              const std::vector<GaugeMat> &U,
                              const int Rmu, const int Rnu,
                              const int mu, const int nu) {
    GaugeMat sp(U[0]._grid);
    wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
    wl = trace(sp);
  }
  //////////////////////////////////////////////////
  // sum over all planes of Wilson loop
  //////////////////////////////////////////////////
  static void siteWilsonLoop(LatticeComplex &Wl,
                             const std::vector<GaugeMat> &U,
                             const int R1, const int R2) {
    LatticeComplex siteWl(U[0]._grid);
    Wl = zero;
    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
        Wl = Wl + siteWl;
        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
        Wl = Wl + siteWl;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over planes of Wilson loop with length R1
  // in the time direction
  //////////////////////////////////////////////////
  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
                                     const std::vector<GaugeMat> &U,
                                     const int R1, const int R2) {
    LatticeComplex siteWl(U[0]._grid);

    int ndim = U[0]._grid->_ndimension;

    Wl = zero;
    for (int nu = 0; nu < ndim - 1; nu++) {
      traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
      Wl = Wl + siteWl;
    }
  }
  //////////////////////////////////////////////////
  // sum Wilson loop over all planes orthogonal to the time direction
  //////////////////////////////////////////////////
  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
                                    const std::vector<GaugeMat> &U,
                                    const int R1, const int R2) {
    LatticeComplex siteWl(U[0]._grid);

    Wl = zero;
    for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
        Wl = Wl + siteWl;
        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
        Wl = Wl + siteWl;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
  static Real sumWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    std::vector<GaugeMat> U(4, Umu._grid);

    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

    LatticeComplex Wl(Umu._grid);

    siteWilsonLoop(Wl, U, R1, R2);

    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of timelike Wilson loop
  //////////////////////////////////////////////////
  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
                                    const int R1, const int R2) {
    std::vector<GaugeMat> U(4, Umu._grid);

    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

    LatticeComplex Wl(Umu._grid);

    siteTimelikeWilsonLoop(Wl, U, R1, R2);

    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
                                   const int R1, const int R2) {
    std::vector<GaugeMat> U(4, Umu._grid);

    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

    LatticeComplex Wl(Umu._grid);

    siteSpatialWilsonLoop(Wl, U, R1, R2);

    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
  static Real avgWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    int ndim = Umu._grid->_ndimension;
    Real sumWl = sumWilsonLoop(Umu, R1, R2);
    Real vol = Umu._grid->gSites();
    Real faces = 1.0 * ndim * (ndim - 1);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of timelike Wilson loop
  //////////////////////////////////////////////////
  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
                                    const int R1, const int R2) {
    int ndim = Umu._grid->_ndimension;
    Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
    Real vol = Umu._grid->gSites();
    Real faces = 1.0 * (ndim - 1);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
                                   const int R1, const int R2) {
    int ndim = Umu._grid->_ndimension;
    Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
    Real vol = Umu._grid->gSites();
    Real faces = 1.0 * (ndim - 1) * (ndim - 2);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
};

END_QEDFVOL_NAMESPACE

#endif // QEDFVOL_WILSONLOOPS_H
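The comment inside dirPlaquette above is about a general C++ rule: members of a dependent base class are invisible to unqualified lookup inside a class template, and a static member function has no this-> to fall back on, which is why every call is spelled Gimpl::CovShiftForward / Gimpl::CovShiftBackward. A stand-alone illustration of the same rule, with toy names that are not Grid code:

// Toy illustration of the dependent-base lookup issue noted in dirPlaquette (not Grid code).
template <class Base> struct Derived : public Base {
  static int twice(int x) {
    // return helper(x) + helper(x);          // error: 'helper' in the dependent base is not found
    return Base::helper(x) + Base::helper(x); // OK: qualify with the base, as the Gimpl:: calls do
  }
};

struct Impl { static int helper(int x) { return x + 1; } };

int example() { return Derived<Impl>::twice(1); }  // returns 4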
@ -1,88 +0,0 @@
#include <Global.hpp>
#include <WilsonLoops.h>

using namespace Grid;
using namespace QCD;
using namespace QedFVol;

typedef PeriodicGaugeImpl<QedGimplR> QedPeriodicGimplR;
typedef PhotonR::GaugeField         EmField;
typedef PhotonR::GaugeLinkField     EmComp;

const int NCONFIGS = 10;
const int NWILSON  = 10;

int main(int argc, char *argv[])
{
  // parse command line
  std::string parameterFileName;

  if (argc < 2)
  {
    std::cerr << "usage: " << argv[0] << " <parameter file> [Grid options]";
    std::cerr << std::endl;
    std::exit(EXIT_FAILURE);
  }
  parameterFileName = argv[1];

  // initialization
  Grid_init(&argc, &argv);
  QedFVolLogError.Active(GridLogError.isActive());
  QedFVolLogWarning.Active(GridLogWarning.isActive());
  QedFVolLogMessage.Active(GridLogMessage.isActive());
  QedFVolLogIterative.Active(GridLogIterative.isActive());
  QedFVolLogDebug.Active(GridLogDebug.isActive());
  LOG(Message) << "Grid initialized" << std::endl;

  // QED stuff
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(4, vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian    grid(latt_size,simd_layout,mpi_layout);
  GridParallelRNG  pRNG(&grid);
  PhotonR          photon(PhotonR::Gauge::feynman,
                          PhotonR::ZmScheme::qedL);
  EmField          a(&grid);
  EmField          expA(&grid);

  Complex imag_unit(0, 1);

  Real wlA;
  std::vector<Real> logWlAvg(NWILSON, 0.0), logWlTime(NWILSON, 0.0), logWlSpace(NWILSON, 0.0);

  pRNG.SeedRandomDevice();

  LOG(Message) << "Wilson loop calculation beginning" << std::endl;
  for(int ic = 0; ic < NCONFIGS; ic++){
    LOG(Message) << "Configuration " << ic <<std::endl;
    photon.StochasticField(a, pRNG);

    // Exponentiate photon field
    expA = exp(imag_unit*a);

    // Calculate Wilson loops
    for(int iw=1; iw<=NWILSON; iw++){
      wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, iw, iw) * 3;
      logWlAvg[iw-1] -= 2*log(wlA);
      wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, iw, iw) * 3;
      logWlTime[iw-1] -= 2*log(wlA);
      wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, iw, iw) * 3;
      logWlSpace[iw-1] -= 2*log(wlA);
    }
  }
  LOG(Message) << "Wilson loop calculation completed" << std::endl;

  // Report Wilson loops
  for(int iw=1; iw<=10; iw++){
    LOG(Message) << iw << 'x' << iw << " Wilson loop" << std::endl;
    LOG(Message) << "-2log(W) average: " << logWlAvg[iw-1]/NCONFIGS << std::endl;
    LOG(Message) << "-2log(W) timelike: " << logWlTime[iw-1]/NCONFIGS << std::endl;
    LOG(Message) << "-2log(W) spatial: " << logWlSpace[iw-1]/NCONFIGS << std::endl;
  }

  // epilogue
  LOG(Message) << "Grid is finalizing now" << std::endl;
  Grid_finalize();

  return EXIT_SUCCESS;
}
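The bookkeeping in the loops above is simple: each configuration adds -2 log W for an R x R loop, and the report divides by NCONFIGS, so the printed figure is the ensemble average of -2 log W(R,R). A minimal sketch of that arithmetic in isolation (illustrative helper, not part of the program above):

// Illustrative helper mirroring the accumulation used above.
#include <cmath>
#include <vector>

double averagedMinusTwoLogW(const std::vector<double> &wPerConfig)
{
    double acc = 0.0;
    for (double w : wPerConfig) acc -= 2.0 * std::log(w); // same update as logWlAvg[iw-1] -= 2*log(wlA)
    return acc / wPerConfig.size();                       // same normalisation as the print loop
}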
@ -20,17 +20,4 @@ The simple testcase in this directory is the submitted bug report that encapsulates the
problem. The test case works with icpc and with clang++, but fails consistently on current
g++ variants.

Peter

************

Second GCC bug reported, see Issue 100.

https://wandbox.org/permlink/tzssJza6R9XnqANw
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80652

Travis now fails under gcc-5 for Test_simd, since more comprehensive testing was added to the
CI test suite; this exposes the limits of Travis's runtime caps and weak cores.

Travis uses 5.4.1 for g++-5.
@ -1,86 +0,0 @@
#! /bin/sh

prefix=@prefix@
exec_prefix=@exec_prefix@
includedir=@includedir@

usage()
{
    cat <<EOF
Usage: grid-config [OPTION]

Known values for OPTION are:

  --prefix     show Grid installation prefix
  --cxxflags   print pre-processor and compiler flags
  --ldflags    print library linking flags
  --libs       print library linking information
  --summary    print full build summary
  --help       display this help and exit
  --version    output version information
  --git        print git revision

EOF

    exit $1
}

if test $# -eq 0; then
    usage 1
fi

cflags=false
libs=false

while test $# -gt 0; do
    case "$1" in
    -*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
    *) optarg= ;;
    esac

    case "$1" in
    --prefix)
        echo $prefix
        ;;

    --version)
        echo @VERSION@
        exit 0
        ;;

    --git)
        echo "@GRID_BRANCH@ @GRID_SHA@"
        exit 0
        ;;

    --help)
        usage 0
        ;;

    --cxxflags)
        echo @GRID_CXXFLAGS@
        ;;

    --ldflags)
        echo @GRID_LDFLAGS@
        ;;

    --libs)
        echo @GRID_LIBS@
        ;;

    --summary)
        echo ""
        echo "@GRID_SUMMARY@"
        echo ""
        ;;

    *)
        usage
        exit 1
        ;;
    esac
    shift
done

exit 0
@ -1,6 +1,6 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/Algorithms.h

@ -37,24 +37,39 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>

#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientShifted.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h>

#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/CoarsenedMatrix.h>
#include <Grid/algorithms/FFT.h>

// Lanczos support
#include <Grid/algorithms/iterative/MatrixUtils.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>

#include <Grid/algorithms/CoarsenedMatrix.h>

// Eigen/lanczos
// EigCg
// MCR
// Pcg
// Multishift CG
// Hdcg
// GCR
// etc..

// integrator/Leapfrog
// integrator/Omelyan
// integrator/ForceGradient

// montecarlo/hmc
// montecarlo/rhmc
// montecarlo/metropolis
// etc...


#endif
65  lib/AlignedAllocator.cc  Normal file
@ -0,0 +1,65 @@


#include <Grid/Grid.h>

namespace Grid {

int PointerCache::victim;

PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];

void *PointerCache::Insert(void *ptr,size_t bytes) {

  if (bytes < 4096 ) return NULL;

#ifdef _OPENMP
  assert(omp_in_parallel()==0);
#endif
  void * ret = NULL;
  int v = -1;

  for(int e=0;e<Ncache;e++) {
    if ( Entries[e].valid==0 ) {
      v=e;
      break;
    }
  }

  if ( v==-1 ) {
    v=victim;
    victim = (victim+1)%Ncache;
  }

  if ( Entries[v].valid ) {
    ret = Entries[v].address;
    Entries[v].valid = 0;
    Entries[v].address = NULL;
    Entries[v].bytes = 0;
  }

  Entries[v].address=ptr;
  Entries[v].bytes  =bytes;
  Entries[v].valid  =1;

  return ret;
}

void *PointerCache::Lookup(size_t bytes) {

  if (bytes < 4096 ) return NULL;

#ifdef _OPENMP
  assert(omp_in_parallel()==0);
#endif

  for(int e=0;e<Ncache;e++){
    if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
      Entries[e].valid = 0;
      return Entries[e].address;
    }
  }
  return NULL;
}

}
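PointerCache above keeps a small fixed table of recently freed large blocks: Lookup hands back a cached block of exactly the requested size, and Insert parks a freed block, returning an evicted entry (chosen round-robin via victim) that the caller must actually release; requests under 4096 bytes bypass the cache. A sketch of the intended call pattern, with plain malloc/free standing in for the aligned allocation done by alignedAllocator further down (helper names are illustrative only):

// Sketch only: how the cache above is meant to be driven for blocks of >= 4096 bytes.
#include <cstdlib>
#include <Grid/Grid.h>

void *cached_alloc(size_t bytes) {                        // bytes >= 4096 assumed
  void *p = Grid::PointerCache::Lookup(bytes);            // reuse a freed block of exactly this size
  if (p == NULL) p = malloc(bytes);                       // otherwise fall back to a fresh allocation
  return p;
}

void cached_release(void *p, size_t bytes) {              // bytes >= 4096 assumed
  void *evicted = Grid::PointerCache::Insert(p, bytes);   // park p in the cache
  if (evicted != NULL) free(evicted);                     // free only the entry Insert evicted
}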
@ -64,8 +64,6 @@ namespace Grid {
  };

  void check_huge_pages(void *Buf,uint64_t BYTES);

  ////////////////////////////////////////////////////////////////////
  // A lattice of something, but assume the something is SIMDized.
  ////////////////////////////////////////////////////////////////////
@ -94,34 +92,18 @@ public:
    size_type bytes = __n*sizeof(_Tp);

    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
    // if ( ptr != NULL )
    //   std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;

    //////////////////
    // Hack 2MB align; could make option probably doesn't need configurability
    //////////////////
    //define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
#else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
#endif
    // std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
    // First touch optimise in threaded loop
    uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
    for(size_type n=0;n<bytes;n+=4096){
      cp[n]=0;
    }
    return ptr;
  }

  void deallocate(pointer __p, size_type __n) {
    size_type bytes = __n * sizeof(_Tp);

    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);

#ifdef HAVE_MM_MALLOC_H
@ -200,19 +182,10 @@ public:
  pointer allocate(size_type __n, const void* _p= 0)
  {
#ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#else
    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif
    size_type bytes = __n*sizeof(_Tp);
    uint8_t *cp = (uint8_t *)ptr;
    if ( ptr ) {
      // One touch per 4k page, static OMP loop to catch same loop order
#pragma omp parallel for schedule(static)
      for(size_type n=0;n<bytes;n+=4096){
        cp[n]=0;
      }
    }
    return ptr;
  }
  void deallocate(pointer __p, size_type) {
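The "first touch" loops above write one byte per 4 kB page from inside an OpenMP loop, so that under a first-touch NUMA policy each page is faulted in, and therefore placed, by a thread on the node that will later traverse the data with the same static schedule. A minimal stand-alone sketch of the idea (not Grid code):

// Minimal stand-alone sketch of the first-touch initialisation used above (not Grid code).
#include <cstddef>
#include <cstdint>

void first_touch(void *ptr, size_t bytes) {
  uint8_t *cp = (uint8_t *)ptr;
#pragma omp parallel for schedule(static)
  for (size_t n = 0; n < bytes; n += 4096) {  // one write per 4 kB page
    cp[n] = 0;                                // the faulting thread's NUMA node gets the page
  }
}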
@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h>
#endif

#ifdef GRID_COMMS_MPIT
#ifdef GRID_COMMS_MPI3L
#include <Grid/cshift/Cshift_mpi.h>
#endif

@ -1,37 +0,0 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/DisableWarnings.h

    Copyright (C) 2016

Author: Guido Cossu <guido.cossu@ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution
    directory
*************************************************************************************/
/* END LEGAL */

#ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H

// disables an Intel compiler specific warning (in json.hpp)
#pragma warning disable 488

#endif
@ -230,7 +230,6 @@ namespace Grid {
      // Barrel shift and collect global pencil
      std::vector<int> lcoor(Nd), gcoor(Nd);
      result = source;
      int pc = processor_coor[dim];
      for(int p=0;p<processors[dim];p++) {
        PARALLEL_REGION
        {
@ -241,8 +240,7 @@ namespace Grid {
          for(int idx=0;idx<sgrid->lSites();idx++) {
            sgrid->LocalIndexToLocalCoor(idx,cbuf);
            peekLocalSite(s,result,cbuf);
            cbuf[dim]+=((pc+p) % processors[dim])*L;
            cbuf[dim]+=p*L;
            // cbuf[dim]+=p*L;
            pokeLocalSite(s,pgbuf,cbuf);
          }
        }
@ -280,6 +278,7 @@ namespace Grid {
        flops+= flops_call*NN;

        // writing out result
        int pc = processor_coor[dim];
        PARALLEL_REGION
        {
          std::vector<int> clbuf(Nd), cgbuf(Nd);
54  lib/Grid.h
@ -38,12 +38,52 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_H
#define GRID_H

#include <Grid/GridCore.h>
///////////////////
#include <Grid/GridQCDcore.h>
// Std C++ dependencies
#include <Grid/qcd/action/Action.h>
///////////////////
#include <Grid/qcd/utils/GaugeFix.h>
#include <cassert>
#include <Grid/qcd/smearing/Smearing.h>
#include <complex>
#include <Grid/parallelIO/MetaData.h>
#include <vector>
#include <Grid/qcd/hmc/HMC_aggregate.h>
#include <iostream>
#include <iomanip>
#include <random>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>
#include <sys/time.h>
#include <chrono>

///////////////////
// Grid headers
///////////////////
#include "Config.h"
#include <Grid/Timer.h>
#include <Grid/PerfCount.h>
#include <Grid/Log.h>
#include <Grid/AlignedAllocator.h>
#include <Grid/Simd.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/Threads.h>
#include <Grid/Lexicographic.h>
#include <Grid/Init.h>
#include <Grid/Communicator.h>
#include <Grid/Cartesian.h>
#include <Grid/Tensors.h>
#include <Grid/Lattice.h>
#include <Grid/Cshift.h>
#include <Grid/Stencil.h>
#include <Grid/Algorithms.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/FFT.h>

#include <Grid/qcd/QCD.h>
#include <Grid/parallelIO/NerscIO.h>
#include <Grid/qcd/hmc/NerscCheckpointer.h>
#include <Grid/qcd/hmc/HmcRunner.h>



#endif
@ -1,29 +0,0 @@
#ifndef GRID_STD_H
#define GRID_STD_H

///////////////////
// Std C++ dependencies
///////////////////
#include <cassert>
#include <complex>
#include <vector>
#include <string>
#include <iostream>
#include <iomanip>
#include <random>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>
#include <sys/time.h>
#include <chrono>
#include <zlib.h>

///////////////////
// Grid config
///////////////////
#include "Config.h"

#endif /* GRID_STD_H */
@ -1,9 +0,0 @@
#pragma once
#if defined __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
#include <Grid/Eigen/Dense>
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif
@ -1,6 +1,6 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/Init.cc

@ -36,20 +36,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <signal.h>
#include <iostream>
#include <iterator>
#include <Grid/Grid.h>
#include <algorithm>
#include <iterator>
#include <cstdlib>
#include <memory>

#include <Grid/Grid.h>

#include <Grid/util/CompilerCompatible.h>


#include <fenv.h>
#ifdef __APPLE__
@ -95,14 +92,14 @@ const std::vector<int> GridDefaultSimd(int dims,int nsimd)
    if ( nn>=2) {
      layout[d]=2;
      nn/=2;
    } else {
      layout[d]=1;
    }
  }
  assert(nn==1);
  return layout;
}

////////////////////////////////////////////////////////////
// Command line parsing assist for stock controls
////////////////////////////////////////////////////////////
@ -146,7 +143,7 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
    vec.push_back(i);
    if(std::ispunct(ss.peek()))
      ss.ignore();
  }
  return;
}

@ -219,68 +216,11 @@ void Grid_init(int *argc,char ***argv)
    int MB;
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
    GridCmdOptionInt(arg,MB);
    uint64_t MB64 = MB;
    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
    CartesianCommunicator::Hugepages = 1;
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }

  CartesianCommunicator::Init(argc,argv);

  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
    Grid_quiesce_nodes();
  } else {
    FILE *fp;
    std::ostringstream fname;
    fname<<"Grid.stdout.";
    fname<<CartesianCommunicator::RankWorld();
    fp=freopen(fname.str().c_str(),"w",stdout);
    assert(fp!=(FILE *)NULL);
  }

  ////////////////////////////////////
  // Banner
  ////////////////////////////////////
  if ( CartesianCommunicator::RankWorld() == 0 ) {
    std::cout <<std::endl;
    std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
    std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
    std::cout << "__|_ | | | | | | | | | | | | _|__"<<std::endl;
    std::cout << "__|_ _|__"<<std::endl;
    std::cout << "__|_ GGGG RRRR III DDDD _|__"<<std::endl;
    std::cout << "__|_ G R R I D D _|__"<<std::endl;
    std::cout << "__|_ G R R I D D _|__"<<std::endl;
    std::cout << "__|_ G GG RRRR I D D _|__"<<std::endl;
    std::cout << "__|_ G G R R I D D _|__"<<std::endl;
    std::cout << "__|_ GGGG R R III DDDD _|__"<<std::endl;
    std::cout << "__|_ _|__"<<std::endl;
    std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
    std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
    std::cout << " | | | | | | | | | | | | | | "<<std::endl;
    std::cout << std::endl;
    std::cout << std::endl;
    std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
    std::cout << std::endl;
    std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
    std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
    std::cout << "the Free Software Foundation; either version 2 of the License, or"<<std::endl;
    std::cout << "(at your option) any later version."<<std::endl;
    std::cout << std::endl;
    std::cout << "This program is distributed in the hope that it will be useful,"<<std::endl;
    std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
    std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl;
    std::cout << "GNU General Public License for more details."<<std::endl;
    std::cout << std::endl;
  }

  ////////////////////////////////////
  // Logging
  ////////////////////////////////////
@ -290,6 +230,9 @@ void Grid_init(int *argc,char ***argv)
  GridCmdOptionCSL(defaultLog,logstreams);
  GridLogConfigure(logstreams);

  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
    Grid_quiesce_nodes();
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
    arg = GridCmdOptionPayload(*argv,*argv+*argc,"--log");
@ -305,80 +248,101 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<" --help : this message"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<" --mpi n.n.n.n : default MPI decomposition"<<std::endl;
    std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
    std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
    std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<" --log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
    std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
    std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
    std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
    std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
    std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Performance:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
    std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
    std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
    std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
    std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
    std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    exit(EXIT_SUCCESS);
  }

  ////////////////////////////////////
  // Banner
  ////////////////////////////////////

  std::string COL_RED    = GridLogColours.colour["RED"];
  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
  std::string COL_BLACK  = GridLogColours.colour["BLACK"];
  std::string COL_GREEN  = GridLogColours.colour["GREEN"];
  std::string COL_BLUE   = GridLogColours.colour["BLUE"];
  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];

  std::cout <<std::endl;
  std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
  std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
  std::cout <<COL_RED << "__|_ | | | "<< "| | | "<<COL_PURPLE<<" | | |"<< " | | | _|__"<<std::endl;
  std::cout <<COL_RED << "__|_ "<< " "<<COL_PURPLE<<" "<< " _|__"<<std::endl;
  std::cout <<COL_RED << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" RRRR "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_PURPLE<<" _|__"<<std::endl;
  std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_PURPLE<<" _|__"<<std::endl;
  std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_PURPLE<<" _|__"<<std::endl;
  std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G GG "<<COL_RED<<" RRRR "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_GREEN <<" _|__"<<std::endl;
  std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_GREEN <<" _|__"<<std::endl;
  std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" R R "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_GREEN <<" _|__"<<std::endl;
  std::cout <<COL_BLUE << "__|_ "<< " "<<COL_GREEN <<" "<< " _|__"<<std::endl;
  std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
  std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
  std::cout <<COL_BLUE << " | | | | "<< "| | | "<<COL_GREEN <<" | | |"<< " | | | | "<<std::endl;
  std::cout << std::endl;
  std::cout << std::endl;
  std::cout <<COL_YELLOW<< std::endl;
  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
  std::cout << std::endl;
  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
  std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
  std::cout << "the Free Software Foundation; either version 2 of the License, or"<<std::endl;
  std::cout << "(at your option) any later version."<<std::endl;
  std::cout << std::endl;
  std::cout << "This program is distributed in the hope that it will be useful,"<<std::endl;
  std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl;
  std::cout << "GNU General Public License for more details."<<std::endl;
  std::cout << COL_BACKGROUND <<std::endl;
  std::cout << std::endl;

  ////////////////////////////////////
  // Debug and performance options
  ////////////////////////////////////

  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
    QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
    QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
    QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
  } else {
    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }
  CartesianCommunicator::nCommThreads = -1;
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
    GridLogTimestamp(0);
  } else {
    GridLogTimestamp(1);
  }

@ -387,12 +351,9 @@ void Grid_init(int *argc,char ***argv)
                 Grid_default_mpi);

  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
  if ( CartesianCommunicator::Hugepages) {
    std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"Grid Decomposition\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
@ -405,39 +366,30 @@ void Grid_init(int *argc,char ***argv)
  Grid_is_initialised = 1;
}


void Grid_finalize(void)
{
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
  MPI_Finalize();
  Grid_unquiesce_nodes();
#endif
#if defined (GRID_COMMS_SHMEM)
  shmem_finalize();
#endif
}

void GridLogLayout() {
  std::cout << GridLogMessage << "Grid Layout\n";
  std::cout << GridLogMessage << "\tGlobal lattice size : "<< GridCmdVectorIntToString(GridDefaultLatt()) << std::endl;
  std::cout << GridLogMessage << "\tOpenMP threads : "<< GridThread::GetThreads() <<std::endl;
  std::cout << GridLogMessage << "\tMPI tasks : "<< GridCmdVectorIntToString(GridDefaultMpi()) << std::endl;
}

void * Grid_backtrace_buffer[_NBACKTRACE];

void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{
  fprintf(stderr,"Caught signal %d\n",si->si_signo);
  printf("Caught signal %d\n",si->si_signo);
  fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
  printf(" mem address %llx\n",(unsigned long long)si->si_addr);
  fprintf(stderr," code %d\n",si->si_code);
  printf(" code %d\n",si->si_code);

  // Linux/Posix
#ifdef __linux__
  // And x86 64bit
#ifdef __x86_64__
  ucontext_t * uc= (ucontext_t *)ptr;
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
  fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
  printf(" instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) printf(" %s %lx\n",#A,sc-> A);
  REG(rdi);
  REG(rsi);
@ -460,11 +412,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
  REG(r15);
#endif
#endif
  fflush(stderr);
  BACKTRACE();
  BACKTRACEFP(stderr);
  fprintf(stderr,"Called backtrace\n");
  fflush(stdout);
  fflush(stderr);
  exit(0);
  return;
};
@ -477,12 +425,9 @@ void Grid_debug_handler_init(void)
  sa.sa_flags = SA_SIGINFO;
  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
  sigaction(SIGBUS,&sa,NULL);

  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);

  sigaction(SIGFPE,&sa,NULL);
  sigaction(SIGKILL,&sa,NULL);
  sigaction(SIGILL,&sa,NULL);
}
}
@ -1,6 +1,6 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/Init.h

@ -46,7 +46,6 @@ namespace Grid {
  const int &GridThreads(void) ;
  void GridSetThreads(int t) ;
  void GridLogTimestamp(int);
  void GridLogLayout();

  // Common parsing chores
  std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
@ -7,7 +7,7 @@ namespace Grid{
  class Lexicographic {
  public:

    static inline void CoorFromIndex (std::vector<int>& coor,int index,const std::vector<int> &dims){
    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=0;d<nd;d++){
@ -16,7 +16,7 @@ namespace Grid{
      }
    }

    static inline void IndexFromCoor (const std::vector<int>& coor,int &index,const std::vector<int> &dims){
    static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
      int nd=dims.size();
      int stride=1;
      index=0;
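CoorFromIndex and IndexFromCoor convert between a flat lexicographic index and a coordinate vector; judging from the stride initialisation above, dimension 0 runs fastest, i.e. index = coor[0] + dims[0]*(coor[1] + dims[1]*(coor[2] + ...)). A small worked example under that assumption (the values are illustrative, and lvalues are used so it compiles against either signature shown):

// Worked example of the lexicographic convention assumed above: dimension 0 fastest.
#include <cassert>
#include <vector>
#include <Grid/Lexicographic.h>

void lexicographic_example() {
  std::vector<int> dims = {4, 4, 4, 8};          // illustrative local volume
  std::vector<int> coor = {1, 2, 3, 5};
  int index = 0;
  Grid::Lexicographic::IndexFromCoor(coor, index, dims);
  assert(index == 1 + 4*(2 + 4*(3 + 4*5)));      // == 377

  std::vector<int> back;
  Grid::Lexicographic::CoorFromIndex(back, index, dims);
  assert(back == coor);                          // round trip recovers the coordinate
}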
@@ -29,11 +29,9 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /* END LEGAL */
-#include <Grid/GridCore.h>
-#include <Grid/util/CompilerCompatible.h>
+#include <Grid/Grid.h>

 #include <cxxabi.h>
-#include <memory>

 namespace Grid {

@@ -95,7 +93,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
 int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
 MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
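Grid_quiesce_nodes() exists so that only one MPI rank keeps writing to the terminal; the hunk merely swaps which comms backends compile the rank query. The underlying trick is generic MPI and can be sketched standalone (message text is illustrative, not Grid's output):

#include <cstdio>
#include <mpi.h>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int me = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  if (me != 0) {
    // Silence every rank but 0 by pointing stdout at /dev/null.
    freopen("/dev/null", "w", stdout);
  }
  printf("only rank 0 reaches the terminal (I am rank %d)\n", me);
  MPI_Finalize();
  return 0;
}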
@@ -110,8 +110,8 @@ public:
 friend std::ostream& operator<< (std::ostream& stream, Logger& log){

 if ( log.active ) {
-stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : ";
-stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : ";
+stream << log.background()<< std::setw(10) << std::left << log.topName << log.background()<< " : ";
+stream << log.colour() << std::setw(14) << std::left << log.name << log.background() << " : ";
 if ( log.timestamp ) {
 StopWatch.Stop();
 GridTime now = StopWatch.Elapsed();
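The only substantive change here is the std::setw field widths for the two log prefix columns (8 to 10, and 10 to 14), presumably to keep longer logger names aligned. For reference, setw with left simply pads each field to a fixed width; a standalone illustration:

#include <iomanip>
#include <iostream>

int main() {
  // Two left-justified columns of width 10 and 14, as in the wider variant above.
  std::cout << std::setw(10) << std::left << "Grid"    << " : "
            << std::setw(14) << std::left << "Message" << " : "
            << "log text" << std::endl;
  return 0;
}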
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
 extra_sources+=communicator/Communicator_base.cc
 endif

-if BUILD_COMMS_MPIT
-extra_sources+=communicator/Communicator_mpit.cc
+if BUILD_COMMS_MPI3L
+extra_sources+=communicator/Communicator_mpi3_leader.cc
 extra_sources+=communicator/Communicator_base.cc
 endif

BIN lib/Old/Endeavour.tgz Normal file
Binary file not shown.

154 lib/Old/Tensor_peek.h Normal file
@@ -0,0 +1,154 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/Old/Tensor_peek.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#ifndef GRID_MATH_PEEK_H
+#define GRID_MATH_PEEK_H
+namespace Grid {
+
+//////////////////////////////////////////////////////////////////////////////
+// Peek on a specific index; returns a scalar in that index, tensor inherits rest
+//////////////////////////////////////////////////////////////////////////////
+// If we hit the right index, return scalar with no further recursion
+
+//template<int Level> inline ComplexF peekIndex(const ComplexF arg) { return arg;}
+//template<int Level> inline ComplexD peekIndex(const ComplexD arg) { return arg;}
+//template<int Level> inline RealF peekIndex(const RealF arg) { return arg;}
+//template<int Level> inline RealD peekIndex(const RealD arg) { return arg;}
+#if 0
+// Scalar peek, no indices
+template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
+auto peekIndex(const iScalar<vtype> &arg) -> iScalar<vtype>
+{
+return arg;
+}
+// Vector peek, one index
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
+auto peekIndex(const iVector<vtype,N> &arg,int i) -> iScalar<vtype> // Index matches
+{
+iScalar<vtype> ret; // return scalar
+ret._internal = arg._internal[i];
+return ret;
+}
+// Matrix peek, two indices
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
+auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) -> iScalar<vtype>
+{
+iScalar<vtype> ret; // return scalar
+ret._internal = arg._internal[i][j];
+return ret;
+}
+
+/////////////
+// No match peek for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
+/////////////
+// scalar
+template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+auto peekIndex(const iScalar<vtype> &arg) -> iScalar<decltype(peekIndex<Level>(arg._internal))>
+{
+iScalar<decltype(peekIndex<Level>(arg._internal))> ret;
+ret._internal= peekIndex<Level>(arg._internal);
+return ret;
+}
+template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+auto peekIndex(const iScalar<vtype> &arg,int i) -> iScalar<decltype(peekIndex<Level>(arg._internal,i))>
+{
+iScalar<decltype(peekIndex<Level>(arg._internal,i))> ret;
+ret._internal=peekIndex<Level>(arg._internal,i);
+return ret;
+}
+template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+auto peekIndex(const iScalar<vtype> &arg,int i,int j) -> iScalar<decltype(peekIndex<Level>(arg._internal,i,j))>
+{
+iScalar<decltype(peekIndex<Level>(arg._internal,i,j))> ret;
+ret._internal=peekIndex<Level>(arg._internal,i,j);
+return ret;
+}
+// vector
+template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+auto peekIndex(const iVector<vtype,N> &arg) -> iVector<decltype(peekIndex<Level>(arg._internal[0])),N>
+{
+iVector<decltype(peekIndex<Level>(arg._internal[0])),N> ret;
+for(int ii=0;ii<N;ii++){
+ret._internal[ii]=peekIndex<Level>(arg._internal[ii]);
+}
+return ret;
+}
+template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+auto peekIndex(const iVector<vtype,N> &arg,int i) -> iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N>
+{
+iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N> ret;
+for(int ii=0;ii<N;ii++){
+ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i);
+}
+return ret;
+}
+template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+auto peekIndex(const iVector<vtype,N> &arg,int i,int j) -> iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N>
+{
+iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N> ret;
+for(int ii=0;ii<N;ii++){
+ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i,j);
+}
+return ret;
+}
+
+// matrix
+template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+auto peekIndex(const iMatrix<vtype,N> &arg) -> iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N>
+{
+iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N> ret;
+for(int ii=0;ii<N;ii++){
+for(int jj=0;jj<N;jj++){
+ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj]);// Could avoid this because peeking a scalar is dumb
+}}
+return ret;
+}
+template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+auto peekIndex(const iMatrix<vtype,N> &arg,int i) -> iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N>
+{
+iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N> ret;
+for(int ii=0;ii<N;ii++){
+for(int jj=0;jj<N;jj++){
+ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i);
+}}
+return ret;
+}
+template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) -> iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N>
+{
+iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N> ret;
+for(int ii=0;ii<N;ii++){
+for(int jj=0;jj<N;jj++){
+ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i,j);
+}}
+return ret;
+}
+#endif
+
+
+}
+#endif
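Although this archived copy keeps its bodies inside #if 0, the interface it documents is the one used throughout Grid's tensor nest: peekIndex<Level>(tensor, i [,j]) strips one tensor level, returning a scalar in that index while the rest of the structure is inherited. A rough usage sketch, assuming the Grid tensor headers; the typedef and index values are invented for illustration, and TensorLevel counts from the innermost arithmetic type at level 0:

#include <Grid/Grid.h>
using namespace Grid;

// Invented two-level nest: a 4-vector (level 2) of 3x3 matrices (level 1) of ComplexD (level 0).
typedef iVector<iMatrix<ComplexD, 3>, 4> TwoLevelTensor;

void peek_sketch(const TwoLevelTensor &T) {
  // Strip the outer vector index: result is an iScalar-wrapped 3x3 matrix.
  auto mat = peekIndex<2>(T, 1);
  // Address the matrix level with two indices instead: the vector index is
  // untouched, so the result is still a 4-component vector of wrapped scalars.
  auto vec = peekIndex<1>(T, 0, 2);
}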
127 lib/Old/Tensor_poke.h Normal file
@@ -0,0 +1,127 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/Old/Tensor_poke.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#ifndef GRID_MATH_POKE_H
+#define GRID_MATH_POKE_H
+namespace Grid {
+
+//////////////////////////////////////////////////////////////////////////////
+// Poke a specific index;
+//////////////////////////////////////////////////////////////////////////////
+#if 0
+// Scalar poke
+template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
+void pokeIndex(iScalar<vtype> &ret, const iScalar<vtype> &arg)
+{
+ret._internal = arg._internal;
+}
+// Vector poke, one index
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
+void pokeIndex(iVector<vtype,N> &ret, const iScalar<vtype> &arg,int i)
+{
+ret._internal[i] = arg._internal;
+}
+//Matrix poke, two indices
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
+void pokeIndex(iMatrix<vtype,N> &ret, const iScalar<vtype> &arg,int i,int j)
+{
+ret._internal[i][j] = arg._internal;
+}
+
+/////////////
+// No match poke for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
+/////////////
+// scalar
+template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal))> &arg)
+{
+pokeIndex<Level>(ret._internal,arg._internal);
+}
+template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0))> &arg, int i)
+
+{
+pokeIndex<Level>(ret._internal,arg._internal,i);
+}
+template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0,0))> &arg,int i,int j)
+{
+pokeIndex<Level>(ret._internal,arg._internal,i,j);
+}
+
+// Vector
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+void pokeIndex(iVector<vtype,N> &ret, iVector<decltype(peekIndex<Level>(ret._internal)),N> &arg)
+{
+for(int ii=0;ii<N;ii++){
+pokeIndex<Level>(ret._internal[ii],arg._internal[ii]);
+}
+}
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(peekIndex<Level>(ret._internal,0)),N> &arg,int i)
+{
+for(int ii=0;ii<N;ii++){
+pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i);
+}
+}
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(peekIndex<Level>(ret._internal,0,0)),N> &arg,int i,int j)
+{
+for(int ii=0;ii<N;ii++){
+pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i,j);
+}
+}
+
+// Matrix
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal)),N> &arg)
+{
+for(int ii=0;ii<N;ii++){
+for(int jj=0;jj<N;jj++){
+pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj]);
+}}
+}
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal,0)),N> &arg,int i)
+{
+for(int ii=0;ii<N;ii++){
+for(int jj=0;jj<N;jj++){
+pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i);
+}}
+}
+template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
+void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal,0,0)),N> &arg, int i,int j)
+{
+for(int ii=0;ii<N;ii++){
+for(int jj=0;jj<N;jj++){
+pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i,j);
+}}
+}
+#endif
+
+}
+#endif
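pokeIndex<Level> is the write-side counterpart: it takes an object of the shape peekIndex<Level> would return and inserts it at the chosen index. A matching sketch under the same assumptions (Grid headers available, invented typedef and indices):

#include <Grid/Grid.h>
using namespace Grid;

typedef iVector<iMatrix<ComplexD, 3>, 4> TwoLevelTensor;

void poke_sketch(TwoLevelTensor &T) {
  // Read the 3x3 matrix sitting at outer index 0 ...
  auto slice = peekIndex<2>(T, 0);
  // ... and write the same matrix into outer index 3.
  pokeIndex<2>(T, slice, 3);
}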
@@ -26,8 +26,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 *************************************************************************************/
 /* END LEGAL */

-#include <Grid/GridCore.h>
-#include <Grid/perfmon/PerfCount.h>
+#include <Grid/Grid.h>
+#include <Grid/PerfCount.h>

 namespace Grid {

@@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
 { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
 { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
 // 4
-#ifdef KNL
+#ifdef AVX512
 { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
 { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
 { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
@@ -172,7 +172,7 @@ public:
 const char * name = PerformanceCounterConfigs[PCT].name;
 fd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
 if (fd == -1) {
-fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
+fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
 perror("Error is");
 }
 int norm = PerformanceCounterConfigs[PCT].normalisation;
@@ -181,7 +181,7 @@ public:
 name = PerformanceCounterConfigs[norm].name;
 cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
 if (cyclefd == -1) {
-fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
+fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
 perror("Error is");
 }
 #endif
@@ -205,14 +205,13 @@ public:
 void Stop(void) {
 count=0;
 cycles=0;
+size_t ign;
 #ifdef __linux__
-ssize_t ign;
 if ( fd!= -1) {
 ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
 ign=::read(fd, &count, sizeof(long long));
-ign+=::read(cyclefd, &cycles, sizeof(long long));
-assert(ign=2*sizeof(long long));
+ign=::read(cyclefd, &cycles, sizeof(long long));
 }
 elapsed = cyclecount() - begin;
 #else
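Both PerfCount hunks touch the standard Linux perf_event pattern: open a counter with perf_event_open (the (long long) cast in the fprintf only silences format warnings, since pe.config is a __u64), enable it around the timed region, then read back a 64-bit count and check the result of ::read. A self-contained sketch of that pattern, with illustrative names (Linux only; there is no libc wrapper, so the call goes through syscall):

#include <cstdio>
#include <cstring>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

// No glibc wrapper exists, so perf_event_open is reached via syscall(2).
static long perf_event_open_sketch(struct perf_event_attr *attr, pid_t pid,
                                   int cpu, int group_fd, unsigned long flags) {
  return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

long long count_instructions_sketch(void (*work)(void)) {
  struct perf_event_attr pe;
  memset(&pe, 0, sizeof(pe));
  pe.type = PERF_TYPE_HARDWARE;
  pe.size = sizeof(pe);
  pe.config = PERF_COUNT_HW_INSTRUCTIONS;
  pe.disabled = 1;
  pe.exclude_kernel = 1;

  int fd = perf_event_open_sketch(&pe, 0, -1, -1, 0); // this process, any CPU, no group
  if (fd == -1) { perror("perf_event_open"); return -1; }

  ioctl(fd, PERF_EVENT_IOC_RESET, 0);
  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
  work();
  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

  long long count = 0;
  ssize_t got = read(fd, &count, sizeof(count)); // check the read, as the hunk above does
  close(fd);
  return (got == (ssize_t)sizeof(count)) ? count : -1;
}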
@@ -172,8 +172,8 @@ namespace Grid {

 };

-#include <Grid/simd/Grid_vector_types.h>
-#include <Grid/simd/Grid_vector_unops.h>
+#include "simd/Grid_vector_types.h"
+#include "simd/Grid_vector_unops.h"

 namespace Grid {
 // Default precision
@@ -1,9 +1,11 @@
-#include <Grid/GridCore.h>
-#include <Grid/perfmon/PerfCount.h>
-#include <Grid/perfmon/Stat.h>
+#include <Grid/Grid.h>
+#include <Grid/PerfCount.h>
+#include <Grid/Stat.h>


 namespace Grid {


 bool PmuStat::pmu_initialized=false;

File diff suppressed because it is too large
@@ -37,9 +37,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #ifdef GRID_OMP
 #include <omp.h>
+#ifdef GRID_NUMA
 #define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
+#else
+#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
+#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
+#endif
 #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
 #define PARALLEL_REGION _Pragma("omp parallel")
 #define PARALLEL_CRITICAL _Pragma("omp critical")
@@ -51,9 +55,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define PARALLEL_CRITICAL
 #endif

-#define parallel_for PARALLEL_FOR_LOOP for
-#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for

 namespace Grid {

 // Introduce a class to gain deterministic bit reproducible reduction.
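The macro layering above is what lets call sites write parallel_for(...) while the OpenMP schedule is decided in one place; one side makes the schedule depend on GRID_NUMA, falling back to schedule(runtime), and hence to the OMP_SCHEDULE environment setting or omp_set_schedule, when GRID_NUMA is not defined. The removed parallel_for define is exactly what the CoarsenedMatrix hunks below replace with an explicit PARALLEL_FOR_LOOP. A standalone illustration of the same trick, with a toy loop body rather than Grid code:

#include <cstdio>
#include <vector>

// Same trick as above: hide the pragma behind a macro so loops read naturally.
// Compile with -fopenmp; without it the _Pragma is simply ignored.
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
#define parallel_for      PARALLEL_FOR_LOOP for

int main() {
  const int N = 1000;
  std::vector<double> out(N);
  parallel_for(int i = 0; i < N; i++) { // expands to: #pragma omp parallel for ...; for(...)
    out[i] = 2.0 * i;                   // independent sites, so static scheduling is safe
  }
  printf("out[%d] = %f\n", N - 1, out[N - 1]);
  return 0;
}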
@@ -267,7 +267,8 @@ namespace Grid {
 SimpleCompressor<siteVector> compressor;
 Stencil.HaloExchange(in,compressor);

-parallel_for(int ss=0;ss<Grid()->oSites();ss++){
+PARALLEL_FOR_LOOP
+for(int ss=0;ss<Grid()->oSites();ss++){
 siteVector res = zero;
 siteVector nbr;
 int ptype;
@@ -379,7 +380,8 @@ namespace Grid {
 Subspace.ProjectToSubspace(oProj,oblock);
 // blockProject(iProj,iblock,Subspace.subspace);
 // blockProject(oProj,oblock,Subspace.subspace);
-parallel_for(int ss=0;ss<Grid()->oSites();ss++){
+PARALLEL_FOR_LOOP
+for(int ss=0;ss<Grid()->oSites();ss++){
 for(int j=0;j<nbasis;j++){
 if( disp!= 0 ) {
 A[p]._odata[ss](j,i) = oProj._odata[ss](j);
@@ -425,7 +427,7 @@ namespace Grid {
 A[p]=zero;
 }

-GridParallelRNG RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
+GridParallelRNG RNG(Grid()); RNG.SeedRandomDevice();
 Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);

 Complex one(1.0);
@@ -235,7 +235,7 @@ namespace Grid {
 Field tmp(in._grid);

 _Mat.MeooeDag(in,tmp);
 _Mat.MooeeInvDag(tmp,out);
 _Mat.MeooeDag(out,tmp);

 _Mat.MooeeDag(in,out);
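The call sequence in this hunk is the adjoint of an even-odd (Schur) preconditioned operator: apply the daggered hopping term, the daggered inverse of the opposite-parity diagonal block, the daggered hopping term again, and combine with the daggered diagonal piece (the combining subtraction presumably sits just below the excerpt). Schematically, writing the checkerboarded matrix in block form, the standard identity being implemented is, with the ee/oo labels depending on which SchurDiag variant is in use:

M = \begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix},
\qquad
M_{\rm pc} = M_{ee} - M_{eo}\, M_{oo}^{-1}\, M_{oe},
\qquad
M_{\rm pc}^{\dagger} = M_{ee}^{\dagger} - M_{oe}^{\dagger}\,\bigl(M_{oo}^{\dagger}\bigr)^{-1}\, M_{eo}^{\dagger}.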
0 lib/algorithms/approx/.dirstamp Normal file
@@ -197,9 +197,8 @@ namespace Grid {
 void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

 GridBase *grid=in._grid;
-// std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
-//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
+//std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
+//<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;

 int vol=grid->gSites();

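This operator() is the entry point of the Chebyshev approximation applied to a field; underneath sits the ordinary three-term recurrence T_{k+1}(y) = 2y T_k(y) - T_{k-1}(y), with the spectral interval [lo,hi] mapped linearly onto [-1,1]. A scalar sketch of that evaluation (standalone, not the Grid implementation, and using the common convention of a 1/2 weight on the first coefficient):

#include <vector>

// Evaluate sum_k c_k T_k(y), where y is the linear map of x from [lo,hi] to [-1,1].
double chebyshev_eval_sketch(double x, double lo, double hi,
                             const std::vector<double> &c) {
  double y  = (2.0 * x - (hi + lo)) / (hi - lo);
  double T0 = 1.0, T1 = y;
  double sum = 0.5 * c[0] * T0;            // conventional 1/2 weight on c_0
  if (c.size() > 1) sum += c[1] * T1;
  for (size_t k = 2; k < c.size(); k++) {
    double T2 = 2.0 * y * T1 - T0;         // three-term recurrence
    sum += c[k] * T2;
    T0 = T1; T1 = T2;
  }
  return sum;
}

In the operator version the scalar x is replaced by Linop acting on fields and the T_k become fields themselves, but the recurrence and the interval map are the same.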
Some files were not shown because too many files have changed in this diff.