Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-13 01:05:36 +00:00), commit 95e5a2ade3
.travis.yml

@@ -9,68 +9,6 @@ matrix:
     - os: osx
       osx_image: xcode8.3
       compiler: clang
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.9
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-4.9
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-5
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-5
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz

 before_install:
     - export GRIDDIR=`pwd`

@@ -106,9 +44,3 @@ script:
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
     - make check
-    - echo make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
README.md

@@ -1,27 +1,44 @@
-# Grid
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
-<table>
-<tr>
-    <td>Last stable release</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
-    </td>
-</tr>
-<tr>
-    <td>Development branch</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
-    </td>
-</tr>
-</table>

 **Data parallel C++ mathematical object library.**

 License: GPL v2.

-Last update Nov 2016.
+Last update June 2017.

 _Please do not send pull requests to the `master` branch which is reserved for releases._

+### Description
+This library provides data parallel C++ container classes with internal memory layout
+that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
+are provided, similar to HPF and cmfortran, and user control is given over the mapping of
+array indices to both MPI tasks and SIMD processing elements.
+
+* Identically shaped arrays can then be processed with perfect data parallelisation.
+* Such identically shaped arrays are called conformable arrays.
+
+The transformation is based on the observation that Cartesian array processing involves
+identical processing to be performed on different regions of the Cartesian array.
+
+The library geometrically decomposes across both MPI tasks and SIMD lanes.
+Local vector loops are parallelised with OpenMP pragmas.
+
+Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
+optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
+for most programmers.
+
+The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
+Presently SSE4, ARM NEON (128 bits), AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
+
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.
+The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
+
+MPI, OpenMP, and SIMD parallelism are present in the library.
+Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
+
 ### Compilers

 Intel ICPC v16.0.3 and later
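
To make the data parallel model added to the README above concrete, here is a minimal sketch (editorial illustration, not part of this commit). It reuses only calls that appear in `benchmarks/Benchmark_ITT.cc` further down (`GridCartesian`, `GridDefaultSimd`, `GridDefaultMpi`, `GridSerialRNG`, `random`, `Cshift`, expression-template arithmetic); `GridDefaultLatt` and `Grid_finalize` are assumed from Grid's standard initialisation API.

``` c++
#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  // Distribute a default 4-d lattice over MPI ranks and SIMD lanes.
  std::vector<int> simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> latt_size   = GridDefaultLatt();
  GridCartesian    Grid(latt_size, simd_layout, mpi_layout);

  // Conformable arrays: built on the same grid, hence identical shape and layout.
  typedef Lattice< iVector<vReal, 8> > LatticeVec;
  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
  iVector<vReal, 8> rn; random(sRNG, rn);

  LatticeVec x(&Grid); x = rn;
  LatticeVec y(&Grid); y = rn;
  LatticeVec z(&Grid);

  double a = 2.0;
  z = a * x - y;        // one data parallel statement; MPI + OpenMP + SIMD underneath
  z = Cshift(z, 0, 1);  // CSHIFT facility: circular shift by one site in dimension 0

  Grid_finalize();
  return 0;
}
```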
@@ -56,35 +73,25 @@ When you file an issue, please go though the following checklist:
 6. Attach the output of `make V=1`.
 7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.

-### Description
-This library provides data parallel C++ container classes with internal memory layout
-that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
-are provided, similar to HPF and cmfortran, and user control is given over the mapping of
-array indices to both MPI tasks and SIMD processing elements.
-
-* Identically shaped arrays then be processed with perfect data parallelisation.
-* Such identically shaped arrays are called conformable arrays.
-
-The transformation is based on the observation that Cartesian array processing involves
-identical processing to be performed on different regions of the Cartesian array.
-
-The library will both geometrically decompose into MPI tasks and across SIMD lanes.
-Local vector loops are parallelised with OpenMP pragmas.
-
-Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
-optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
-for most programmers.
-
-The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
-
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
-The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
-
-MPI, OpenMP, and SIMD parallelism are present in the library.
-Please see https://arxiv.org/abs/1512.03487 for more detail.
+### Required libraries
+Grid requires:
+
+[GMP](https://gmplib.org/),
+[MPFR](http://www.mpfr.org/)
+
+Bootstrapping Grid downloads the Eigen library and uses it for internal dense matrix (non-QCD) operations.
+
+Grid optionally uses:
+
+[HDF5](https://support.hdfgroup.org/HDF5/)
+
+[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support.
+
+[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
+
+LAPACK either generic version or Intel MKL library.

 ### Quick start

 First, start by cloning the repository:
@@ -155,7 +162,6 @@ The following options can be use with the `--enable-comms=` option to target dif
 | `none` | no communications |
 | `mpi[-auto]` | MPI communications |
 | `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
-| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem ` | Cray SHMEM communications |

 For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
@@ -173,7 +179,8 @@ The following options can be use with the `--enable-simd=` option to target diff
 | `AVXFMA4` | AVX (256 bit) + FMA4 |
 | `AVX2` | AVX 2 (256 bit) |
 | `AVX512` | AVX 512 bit |
-| `QPX` | QPX (256 bit) |
+| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) |
+| `QPX` | IBM QPX (256 bit) |

 Alternatively, some CPU codenames can be directly used:

@@ -195,21 +202,205 @@ The following configuration is recommended for the Intel Knights Landing platfor
 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=KNL        \
              --enable-comms=mpi-auto  \
-             --with-gmp=<path>        \
-             --with-mpfr=<path>       \
              --enable-mkl             \
              CXX=icpc MPICXX=mpiicpc
 ```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.

-where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:

 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=KNL        \
              --enable-comms=mpi       \
-             --with-gmp=<path>        \
-             --with-mpfr=<path>       \
              --enable-mkl             \
              CXX=CC CC=cc
 ```
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+             --with-gmp=<path>        \
+             --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Knight's Landing with Intel Omnipath adapters with two adapters per node
+presently performs better with use of more than one rank per node, using shared memory
+for interior communication. This is the mpi3 communications implementation.
+We recommend four ranks per node for best performance, but optimum is local volume dependent.
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=KNL        \
+             --enable-comms=mpi3-auto \
+             --enable-mkl             \
+             CC=icpc MPICXX=mpiicpc
+```
+
+### Build setup for Intel Haswell Xeon platform
+
+The following configuration is recommended for the Intel Haswell platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3-auto \
+             --enable-mkl             \
+             CXX=icpc MPICXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+             --with-gmp=<path>        \
+             --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+```
+Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+```
+export I_MPI_PIN=1
+```
+This is the default.
+
+### Build setup for Intel Skylake Xeon platform
+
+The following configuration is recommended for the Intel Skylake platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+             --with-gmp=<path>        \
+             --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+```
+Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+```
+export I_MPI_PIN=1
+```
+This is the default.
+
+#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping:
+
+mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18
+
+TBA
+
+### Build setup for AMD EPYC / RYZEN
+
+The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
+So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
+are common. Each chip within the module exposes a separate NUMA domain.
+There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
+MPI-3 is recommended with the use of four ranks per socket,
+and 8 threads per rank.
+
+The following configuration is recommended for the AMD EPYC platform.
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3      \
+             CXX=mpicxx
+```
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+             --with-gmp=<path>        \
+             --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
+This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this.
+
+It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and
+shared memory to communicate within this node:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+Where omp_bind.sh does the following:
+```
+#!/bin/bash
+
+numanode=` expr $PMI_RANK % 8 `
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0 `
+core1=`expr $basecore + 2 `
+core2=`expr $basecore + 4 `
+core3=`expr $basecore + 6 `
+core4=`expr $basecore + 8 `
+core5=`expr $basecore + 10 `
+core6=`expr $basecore + 12 `
+core7=`expr $basecore + 14 `
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+$@
+```
+
+Performance:
+
+#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+TBA
+
+### Build setup for BlueGene/Q
+
+To be written...
+
+### Build setup for ARM Neon
+
+To be written...
+
+### Build setup for laptops, other compilers, non-cluster builds
+
+Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX),
+and omitting the enable-mkl flag.
+
+Single node builds are enabled with
+```
+--enable-comms=none
+```
+
+FFTW support that is not in the default search path may then be enabled with
+```
+--with-fftw=<installpath>
+```
+
+BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
TODO

@@ -1,23 +1,32 @@
 TODO:
 ---------------

-Peter's work list:
-2)- Precision conversion and sort out localConvert <--
-3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started
-4)- Binary I/O speed up & x-strips
---- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
---- Physical propagator interface
---- Conserved currents
---- GaugeFix into central location
---- Multigrid Wilson and DWF, compare to other Multigrid implementations
---- HDCR resume
+Large item work list:
+1)- BG/Q port and check
+2)- Christoph's local basis expansion Lanczos
+3)- Precision conversion and sort out localConvert <-- partial
+- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
+4)- Physical propagator interface
+5)- Conserved currents
+6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+7)- HDCR resume

 Recent DONE

+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE
+-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
+-- GaugeFix into central location <-- DONE
+-- Scidac and Ildg metadata handling <-- DONE
+-- Binary I/O MPI2 IO <-- DONE
+-- Binary I/O speed up & x-strips <-- DONE
 -- Cut down the exterior overhead <-- DONE
 -- Interior legs from SHM comms <-- DONE
 -- Half-precision comms <-- DONE
--- Merge high precision reduction into develop
--- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+-- Merge high precision reduction into develop <-- DONE
+-- BlockCG, BCGrQ <-- DONE
+-- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE
 -- slice* linalg routines for multiRHS, BlockCG

 -----
benchmarks/Benchmark_ITT.cc (new file, 775 lines)

@@ -0,0 +1,775 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./benchmarks/Benchmark_memory_bandwidth.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;


std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;

double mflop_ref;
double mflop_ref_err;

int NN_global;

struct time_statistics{
  double mean;
  double err;
  double min;
  double max;

  void statistics(std::vector<double> v){
    double sum = std::accumulate(v.begin(), v.end(), 0.0);
    mean = sum / v.size();

    std::vector<double> diff(v.size());
    std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
    err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));

    auto result = std::minmax_element(v.begin(), v.end());
    min = *result.first;
    max = *result.second;
  }
};
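
// Added note (not in the original source): time_statistics::statistics() above reports
// the standard error of the mean,
//   err = sqrt( sum_i (v_i - mean)^2 / (N*(N-1)) ),
// so the quoted uncertainty on a timing shrinks as more samples N are collected;
// min and max give the spread of individual iterations.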

void comms_header(){
  std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
            <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
};

Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT
};
struct controls {
  int Opt;
  int CommsOverlap;
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
  //  int HugePages;
};

class Benchmark {
public:
  static void Decomposition (void ) {

    int threads = GridThread::GetThreads();
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvReal : "<<sizeof(vReal )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplex : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  }

  static void Comms(void)
  {
    int Nloop=200;
    int nmu=0;
    int maxlat=32;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

    std::vector<double> t_time(Nloop);
    time_statistics timestat;

    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    comms_header();

    for(int lat=4;lat<=maxlat;lat+=4){
      for(int Ls=8;Ls<=8;Ls*=2){

        std::vector<int> latt_size  ({lat*mpi_layout[0],
                                      lat*mpi_layout[1],
                                      lat*mpi_layout[2],
                                      lat*mpi_layout[3]});

        GridCartesian Grid(latt_size,simd_layout,mpi_layout);
        RealD Nrank = Grid._Nprocessors;
        RealD Nnode = Grid.NodeCount();
        RealD ppn = Nrank/Nnode;

        std::vector<HalfSpinColourVectorD *> xbuf(8);
        std::vector<HalfSpinColourVectorD *> rbuf(8);
        Grid.ShmBufferFreeAll();
        for(int d=0;d<8;d++){
          xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
        }

        int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
        int ncomm;
        double dbytes;
        std::vector<double> times(Nloop);
        for(int i=0;i<Nloop;i++){

          double start=usecond();

          dbytes=0;
          ncomm=0;

          parallel_for(int dir=0;dir<8;dir++){

            double tbytes;
            int mu =dir % 4;

            if (mpi_layout[mu]>1 ) {

              int xmit_to_rank;
              int recv_from_rank;
              if ( dir == mu ) {
                int comm_proc=1;
                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
              } else {
                int comm_proc = mpi_layout[mu]-1;
                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
              }
              tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
                                                 (void *)&rbuf[dir][0], recv_from_rank,
                                                 bytes,dir);

#ifdef GRID_OMP
#pragma omp atomic
#endif
              ncomm++;

#ifdef GRID_OMP
#pragma omp atomic
#endif
              dbytes+=tbytes;
            }
          }
          Grid.Barrier();
          double stop=usecond();
          t_time[i] = stop-start; // microseconds
        }

        timestat.statistics(t_time);
        // for(int i=0;i<t_time.size();i++){
        //   std::cout << i<<" "<<t_time[i]<<std::endl;
        // }

        dbytes=dbytes*ppn;
        double xbytes    = dbytes*0.5;
        double rbytes    = dbytes*0.5;
        double bidibytes = dbytes;

        std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
                 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
                 <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
                 <<xbytes/timestat.max <<" "<< xbytes/timestat.min
                 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
                 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;


      }
    }

    return;
  }

  static void Memory(void)
  {
    const int Nvec=8;
    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
    typedef iVector<vReal,Nvec> Vec;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

    uint64_t lmax=48;
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian Grid(latt_size,simd_layout,mpi_layout);

      Vec rn ; random(sRNG,rn);

      LatticeVec z(&Grid); z=rn;
      LatticeVec x(&Grid); x=rn;
      LatticeVec y(&Grid); y=rn;
      double a=2.0;

      uint64_t Nloop=NLOOP;

      double start=usecond();
      for(int i=0;i<Nloop;i++){
        z=a*x-y;
        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double flops=vol*Nvec*2;// mul,add
      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<<GridLogMessage<<std::setprecision(3)
               << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

    }
  };

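  // Added note (not in the original source): in Memory() above the reported bandwidth
  // counts three streams per site (read x, read y, write z) of Nvec reals,
  //   bytes = 3 * vol * Nvec * sizeof(Real),
  // and the flop count is 2*vol*Nvec (one multiply and one add per element).
  // The NLOOP macro scales the iteration count as 100*lmax^4/lat^4 so that the total
  // work per lattice size stays roughly constant.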
  static double DWF5(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});

    GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
                                                             GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);

    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    ///////// Lattice Init ////////////
    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    ///////// Source preparation ////////////
    LatticeFermion src   (sFGrid); random(RNG5,src);
    LatticeFermion tmp   (sFGrid);

    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;

    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);

    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
    LatticeFermion src_e (sFrbGrid);
    LatticeFermion src_o (sFrbGrid);
    LatticeFermion r_e   (sFrbGrid);
    LatticeFermion r_o   (sFrbGrid);
    LatticeFermion r_eo  (sFGrid);
    LatticeFermion err   (sFGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

#if defined(AVX512)
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
      controls Cases [] = {
#ifdef AVX512
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential }
      };

      for(int c=0;c<num_cases;c++) {

        QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
        QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute  ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute ) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
        if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
        if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

        int nwarm = 100;
        double t0=usecond();
        sFGrid->Barrier();
        for(int i=0;i<nwarm;i++){
          sDw.DhopEO(src_o,r_e,DaggerNo);
        }
        sFGrid->Barrier();
        double t1=usecond();
        // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
        // if (ncall < 500) ncall = 500;
        uint64_t ncall = 500;

        sFGrid->Broadcast(0,&ncall,sizeof(ncall));

        // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
        sDw.ZeroCounters();

        time_statistics timestat;
        std::vector<double> t_time(ncall);
        for(uint64_t i=0;i<ncall;i++){
          t0=usecond();
          sDw.DhopEO(src_o,r_e,DaggerNo);
          t1=usecond();
          t_time[i] = t1-t0;
        }
        sFGrid->Barrier();

        double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
        double flops=(1344.0*volume)/2;
        double mf_hi, mf_lo, mf_err;

        timestat.statistics(t_time);
        mf_hi = flops/timestat.min;
        mf_lo = flops/timestat.max;
        mf_err= flops/timestat.min * timestat.err/timestat.mean;

        mflops = flops/timestat.mean;
        mflops_all.push_back(mflops);
        if ( mflops_best == 0   ) mflops_best = mflops;
        if ( mflops_worst== 0   ) mflops_worst= mflops;
        if ( mflops>mflops_best ) mflops_best = mflops;
        if ( mflops<mflops_worst) mflops_worst= mflops;

        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;

        sDw.Report();

      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness = "<< mflops_worst/mflops_best <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;

      for(int i=0;i<mflops_all.size();i++){
        std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
    return mflops_best;
  }

  static double DWF(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});

    GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
                                                             GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);

    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;


    ///////// Lattice Init ////////////
    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);


    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    ///////// Source preparation ////////////
    LatticeFermion src   (FGrid); random(RNG5,src);
    LatticeFermion ref   (FGrid);
    LatticeFermion tmp   (FGrid);

    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;

    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);

    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

    ////////////////////////////////////
    // Naive wilson implementation
    ////////////////////////////////////
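    // Added note (not in the original source): the reference field built in the block
    // below is the 4-d Wilson hopping term applied to every s-slice,
    //   ref(x) = -1/2 sum_mu [ (1 - gamma_mu) U_mu(x) src(x+mu)
    //                        + (1 + gamma_mu) U_mu^dag(x-mu) src(x-mu) ],
    // with shifts taken in directions mu+1 because dimension 0 of the 5-d grid is the
    // s-direction. The DhopEO/DhopOE results are compared against this field further down.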
    {
      LatticeGaugeField Umu5d(FGrid);
      std::vector<LatticeColourMatrix> U(4,FGrid);
      for(int ss=0;ss<Umu._grid->oSites();ss++){
        for(int s=0;s<Ls;s++){
          Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
        }
      }
      ref = zero;
      for(int mu=0;mu<Nd;mu++){
        U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
      }
      for(int mu=0;mu<Nd;mu++){

        tmp = U[mu]*Cshift(src,mu+1,1);
        ref=ref + tmp - Gamma(Gmu[mu])*tmp;

        tmp =adj(U[mu])*src;
        tmp =Cshift(tmp,mu+1,-1);
        ref=ref + tmp + Gamma(Gmu[mu])*tmp;
      }
      ref = -0.5*ref;
    }

    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);
    LatticeFermion r_eo  (FGrid);
    LatticeFermion err   (FGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

#if defined(AVX512)
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
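      // Added note (not in the original source): the fmt legend labels the kernel/comms
      // combinations benchmarked below -- A = inline-assembly Nc=3 kernels, U = hand-unrolled
      // Nc=3 kernels, G = generic Nc kernels; S = sequential comms-then-compute,
      // O = overlapped comms and compute.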
|
||||||
|
controls Cases [] = {
|
||||||
|
#ifdef AVX512
|
||||||
|
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
#endif
|
||||||
|
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
|
||||||
|
};
|
||||||
|
|
||||||
|
for(int c=0;c<num_cases;c++) {
|
||||||
|
|
||||||
|
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
|
||||||
|
QCD::WilsonKernelsStatic::Opt = Cases[c].Opt;
|
||||||
|
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||||
|
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
int nwarm = 200;
|
||||||
|
double t0=usecond();
|
||||||
|
FGrid->Barrier();
|
||||||
|
for(int i=0;i<nwarm;i++){
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
|
FGrid->Barrier();
|
||||||
|
double t1=usecond();
|
||||||
|
// uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
|
||||||
|
// if (ncall < 500) ncall = 500;
|
||||||
|
uint64_t ncall = 1000;
|
||||||
|
|
||||||
|
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||||
|
|
||||||
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
Dw.ZeroCounters();
|
||||||
|
|
||||||
|
time_statistics timestat;
|
||||||
|
std::vector<double> t_time(ncall);
|
||||||
|
for(uint64_t i=0;i<ncall;i++){
|
||||||
|
t0=usecond();
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
t1=usecond();
|
||||||
|
t_time[i] = t1-t0;
|
||||||
|
}
|
||||||
|
FGrid->Barrier();
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=(1344.0*volume)/2;
|
||||||
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
|
timestat.statistics(t_time);
|
||||||
|
mf_hi = flops/timestat.min;
|
||||||
|
mf_lo = flops/timestat.max;
|
||||||
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
|
mflops = flops/timestat.mean;
|
||||||
|
mflops_all.push_back(mflops);
|
||||||
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
|
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||||
|
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
|
|
||||||
|
Dw.Report();
|
||||||
|
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
Dw.DhopOE(src_e,r_o,DaggerNo);
|
||||||
|
setCheckerboard(r_eo,r_o);
|
||||||
|
setCheckerboard(r_eo,r_e);
|
||||||
|
err = r_eo-ref;
|
||||||
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
assert((norm2(err)<1.0e-4));
|
||||||
|
|
||||||
|
}
|
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness = "<< mflops_worst/mflops_best <<std::endl;
    std::cout<<GridLogMessage <<fmt << std::endl;
    std::cout<<GridLogMessage ;

    for(int i=0;i<mflops_all.size();i++){
      std::cout<<mflops_all[i]/NN<<" ; " ;
    }
    std::cout<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  }
  return mflops_best;
  }

};

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
#else
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
#endif
  Benchmark::Decomposition();

  int do_memory=1;
  int do_comms =1;
  int do_su3   =0;
  int do_wilson=1;
  int do_dwf   =1;

  if ( do_memory ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Memory();
  }

  if ( do_comms ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Comms();
  }

  if ( do_su3 ) {
    // empty for now
  }

  int sel=2;
  std::vector<int> L_list({8,12,16,24});
  std::vector<double> wilson;
  std::vector<double> dwf4;
  std::vector<double> dwf5;

  if ( do_wilson ) {
    int Ls=1;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      wilson.push_back(Benchmark::DWF(1,L_list[l]));
    }
  }

  int Ls=16;
  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      dwf4.push_back(Benchmark::DWF(Ls,L_list[l]));
    }
  }

  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
    }

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
    for(int l=0;l<L_list.size();l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    int NN=NN_global;
    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 \t\t DWF5 " <<std::endl;
    for(int l=0;l<L_list.size();l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Comparison point result: " << dwf4[sel]/NN <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }

  Grid_finalize();
}
@ -66,9 +66,9 @@ int main (int argc, char ** argv)
   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

-  int Nloop=500;
+  int Nloop=100;
   int nmu=0;
-  int maxlat=24;
+  int maxlat=32;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;

@ -80,7 +80,7 @@ int main (int argc, char ** argv)
   header();
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size ({lat*mpi_layout[0],
                                    lat*mpi_layout[1],

@ -88,12 +88,20 @@ int main (int argc, char ** argv)
                                    lat*mpi_layout[3]});

       GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn   = Nrank/Nnode;

-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);

       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      for(int mu=0;mu<8;mu++){
+        xbuf[mu].resize(lat*lat*lat*Ls);
+        rbuf[mu].resize(lat*lat*lat*Ls);
+        // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }

       for(int i=0;i<Nloop;i++){
         double start=usecond();

@ -109,7 +117,6 @@ int main (int argc, char ** argv)
           int comm_proc=1;
           int xmit_to_rank;
           int recv_from_rank;
           Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
           Grid.SendToRecvFromBegin(requests,
                                    (void *)&xbuf[mu][0],

@ -132,13 +139,13 @@ int main (int argc, char ** argv)
         }
         Grid.SendToRecvFromComplete(requests);
         Grid.Barrier();
         double stop=usecond();
         t_time[i] = stop-start; // microseconds
       }

       timestat.statistics(t_time);

-      double dbytes    = bytes;
+      double dbytes    = bytes*ppn;
       double xbytes    = dbytes*2.0*ncomm;
       double rbytes    = xbytes;
       double bidibytes = xbytes+rbytes;

@ -160,15 +167,23 @@ int main (int argc, char ** argv)
   header();

   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size ({lat,lat,lat,lat});

       GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn   = Nrank/Nnode;

-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
+
+      for(int mu=0;mu<8;mu++){
+        xbuf[mu].resize(lat*lat*lat*Ls);
+        rbuf[mu].resize(lat*lat*lat*Ls);
+        // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }

       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

@ -213,14 +228,14 @@ int main (int argc, char ** argv)
         }
       }
       Grid.Barrier();
       double stop=usecond();
       t_time[i] = stop-start; // microseconds

     }

     timestat.statistics(t_time);

-     double dbytes    = bytes;
+     double dbytes    = bytes*ppn;
      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

@ -243,7 +258,7 @@ int main (int argc, char ** argv)
   header();

   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size ({lat*mpi_layout[0],
                                    lat*mpi_layout[1],

@ -251,6 +266,9 @@ int main (int argc, char ** argv)
                                    lat*mpi_layout[3]});

       GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn   = Nrank/Nnode;

       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);

@ -258,59 +276,66 @@ int main (int argc, char ** argv)
       for(int d=0;d<8;d++){
         xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
         rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
       }

       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      double dbytes;
       for(int i=0;i<Nloop;i++){
         double start=usecond();
+        dbytes=0;
+        ncomm=0;

         std::vector<CartesianCommunicator::CommsRequest_t> requests;

-        ncomm=0;
         for(int mu=0;mu<4;mu++){

           if (mpi_layout[mu]>1 ) {

             ncomm++;
             int comm_proc=1;
             int xmit_to_rank;
             int recv_from_rank;

             Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-            Grid.StencilSendToRecvFromBegin(requests,
-                                            (void *)&xbuf[mu][0],
-                                            xmit_to_rank,
-                                            (void *)&rbuf[mu][0],
-                                            recv_from_rank,
-                                            bytes);
+            dbytes+=
+              Grid.StencilSendToRecvFromBegin(requests,
+                                              (void *)&xbuf[mu][0],
+                                              xmit_to_rank,
+                                              (void *)&rbuf[mu][0],
+                                              recv_from_rank,
+                                              bytes,mu);

             comm_proc = mpi_layout[mu]-1;

             Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-            Grid.StencilSendToRecvFromBegin(requests,
-                                            (void *)&xbuf[mu+4][0],
-                                            xmit_to_rank,
-                                            (void *)&rbuf[mu+4][0],
-                                            recv_from_rank,
-                                            bytes);
+            dbytes+=
+              Grid.StencilSendToRecvFromBegin(requests,
+                                              (void *)&xbuf[mu+4][0],
+                                              xmit_to_rank,
+                                              (void *)&rbuf[mu+4][0],
+                                              recv_from_rank,
+                                              bytes,mu+4);

           }
         }
-        Grid.StencilSendToRecvFromComplete(requests);
+        Grid.StencilSendToRecvFromComplete(requests,0);
         Grid.Barrier();
         double stop=usecond();
         t_time[i] = stop-start; // microseconds

       }

       timestat.statistics(t_time);

-      double dbytes    = bytes;
-      double xbytes    = dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;

       std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
                <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)

@ -330,7 +355,7 @@ int main (int argc, char ** argv)
   header();

   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size ({lat*mpi_layout[0],
                                    lat*mpi_layout[1],

@ -338,6 +363,9 @@ int main (int argc, char ** argv)
                                    lat*mpi_layout[3]});

       GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn   = Nrank/Nnode;

       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);

@ -345,16 +373,18 @@ int main (int argc, char ** argv)
       for(int d=0;d<8;d++){
         xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
         rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
       }

       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      double dbytes;
       for(int i=0;i<Nloop;i++){
         double start=usecond();

         std::vector<CartesianCommunicator::CommsRequest_t> requests;
+        dbytes=0;
         ncomm=0;
         for(int mu=0;mu<4;mu++){

@ -366,41 +396,43 @@ int main (int argc, char ** argv)
           int recv_from_rank;

           Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-          Grid.StencilSendToRecvFromBegin(requests,
-                                          (void *)&xbuf[mu][0],
-                                          xmit_to_rank,
-                                          (void *)&rbuf[mu][0],
-                                          recv_from_rank,
-                                          bytes);
-          Grid.StencilSendToRecvFromComplete(requests);
+          dbytes+=
+            Grid.StencilSendToRecvFromBegin(requests,
+                                            (void *)&xbuf[mu][0],
+                                            xmit_to_rank,
+                                            (void *)&rbuf[mu][0],
+                                            recv_from_rank,
+                                            bytes,mu);
+          Grid.StencilSendToRecvFromComplete(requests,mu);
           requests.resize(0);

           comm_proc = mpi_layout[mu]-1;

           Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-          Grid.StencilSendToRecvFromBegin(requests,
-                                          (void *)&xbuf[mu+4][0],
-                                          xmit_to_rank,
-                                          (void *)&rbuf[mu+4][0],
-                                          recv_from_rank,
-                                          bytes);
-          Grid.StencilSendToRecvFromComplete(requests);
+          dbytes+=
+            Grid.StencilSendToRecvFromBegin(requests,
+                                            (void *)&xbuf[mu+4][0],
+                                            xmit_to_rank,
+                                            (void *)&rbuf[mu+4][0],
+                                            recv_from_rank,
+                                            bytes,mu+4);
+          Grid.StencilSendToRecvFromComplete(requests,mu+4);
           requests.resize(0);

           }
         }
         Grid.Barrier();
         double stop=usecond();
         t_time[i] = stop-start; // microseconds

       }

       timestat.statistics(t_time);

-      double dbytes    = bytes;
-      double xbytes    = dbytes*2.0*ncomm;
-      double rbytes    = xbytes;
-      double bidibytes = xbytes+rbytes;
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;

       std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"

@ -413,5 +445,97 @@ int main (int argc, char ** argv)
     }
   }

+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  header();
+
+  for(int lat=4;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=8;Ls*=2){
+
+      std::vector<int> latt_size ({lat*mpi_layout[0],
+                                   lat*mpi_layout[1],
+                                   lat*mpi_layout[2],
+                                   lat*mpi_layout[3]});
+
+      GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn   = Nrank/Nnode;
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      Grid.ShmBufferFreeAll();
+      for(int d=0;d<8;d++){
+        xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+      }
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      double dbytes;
+      for(int i=0;i<Nloop;i++){
+        double start=usecond();
+
+        std::vector<CartesianCommunicator::CommsRequest_t> requests;
+        dbytes=0;
+        ncomm=0;
+
+        parallel_for(int dir=0;dir<8;dir++){
+
+          double tbytes;
+          int mu =dir % 4;
+
+          if (mpi_layout[mu]>1 ) {
+
+            ncomm++;
+            int xmit_to_rank;
+            int recv_from_rank;
+            if ( dir == mu ) {
+              int comm_proc=1;
+              Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            } else {
+              int comm_proc = mpi_layout[mu]-1;
+              Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            }
+
+            tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+                                               (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
+
+#pragma omp atomic
+            dbytes+=tbytes;
+          }
+        }
+        Grid.Barrier();
+        double stop=usecond();
+        t_time[i] = stop-start; // microseconds
+      }
+
+      timestat.statistics(t_time);
+
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;
+
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+
+    }
+  }
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+
   Grid_finalize();
 }
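The last hunk above adds a threaded halo-exchange benchmark: the eight directional transfers are issued from a parallel loop, each thread adds the bytes it actually moved to a shared counter under `#pragma omp atomic`, and the mean loop time is then turned into a bandwidth figure. Below is a minimal sketch of just that accumulate-and-convert pattern, to be compiled with OpenMP enabled; `fake_transfer` and the payload size are assumptions standing in for Grid's `StencilSendToRecvFrom`, not Grid code.

#include <omp.h>
#include <cstdio>

// Stand-in for a directional transfer; returns the bytes it pretended to move
// (send plus receive), which is what the real call reports back.
static double fake_transfer(int dir, double bytes) { return 2.0 * bytes; }

int main() {
  const double bytes = 8.0 * 1024 * 1024;   // payload per direction (assumed)
  const int    ndir  = 8;
  double dbytes = 0.0;

  double t0 = omp_get_wtime();
#pragma omp parallel for
  for (int dir = 0; dir < ndir; dir++) {
    double tbytes = fake_transfer(dir, bytes);
#pragma omp atomic
    dbytes += tbytes;                       // thread-safe accumulation across directions
  }
  double t_us = (omp_get_wtime() - t0) * 1.0e6;

  // bytes per microsecond equals MB/s, matching the units the benchmark prints
  std::printf("moved %.0f bytes, %.1f MB/s bidirectional\n", dbytes, dbytes / t_us);
  return 0;
}

The atomic update is what lets the per-thread byte counts from concurrent directions be summed safely without a critical section around the whole transfer.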
@ -165,7 +165,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =1000;
+  int ncall =500;
   if (1) {
     FGrid->Barrier();
     Dw.ZeroCounters();

@ -303,6 +303,7 @@ int main (int argc, char ** argv)
   }
   assert(sum < 1.0e-4);

+
   if(1){
     std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
     std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;

@ -381,8 +382,23 @@ int main (int argc, char ** argv)
     }
     assert(error<1.0e-4);
   }
+
+  if(0){
+    std::cout << "Single cache warm call to sDw.Dhop " <<std::endl;
+    for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+      sDw.Dhop(ssrc,sresult,0);
+      PerformanceCounter Counter(i);
+      Counter.Start();
+      sDw.Dhop(ssrc,sresult,0);
+      Counter.Stop();
+      Counter.Report();
+    }
+  }
   }
+

   if (1)
   { // Naive wilson dag implementation
     ref = zero;

@ -487,9 +503,9 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;

-  //assert(norm2(src_e)<1.0e-4);
-  //assert(norm2(src_o)<1.0e-4);
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);

   Grid_finalize();
+  exit(0);
 }
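The disabled `if(0)` block added in the +382 hunk above uses a warm-then-measure pattern: one untimed call populates caches and code paths, then the performance counter brackets a second call. A generic sketch of the same idea with `std::chrono`; the `kernel` lambda is a placeholder, not a Grid call.

#include <chrono>
#include <iostream>

int main() {
  auto kernel = [] {                       // placeholder for the operator being timed
    volatile double sink = 0.0;
    for (int i = 0; i < (1 << 20); i++) sink += i * 1e-9;
  };

  kernel();                                // cache-warm call, deliberately not timed

  auto t0 = std::chrono::steady_clock::now();
  kernel();                                // measured call
  auto t1 = std::chrono::steady_clock::now();

  double us = std::chrono::duration<double, std::micro>(t1 - t0).count();
  std::cout << "warm call took " << us << " us" << std::endl;
  return 0;
}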
@ -55,21 +55,21 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  uint64_t lmax=44;
-#define NLOOP (1*lmax*lmax*lmax*lmax/vol)
-  for(int lat=4;lat<=lmax;lat+=4){
+  uint64_t lmax=96;
+#define NLOOP (10*lmax*lmax*lmax*lmax/vol)
+  for(int lat=8;lat<=lmax;lat+=8){

     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);

     uint64_t Nloop=NLOOP;

-    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

-    LatticeVec z(&Grid); //random(pRNG,z);
-    LatticeVec x(&Grid); //random(pRNG,x);
-    LatticeVec y(&Grid); //random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     double a=2.0;

@ -83,7 +83,7 @@ int main (int argc, char ** argv)
     double time = (stop-start)/Nloop*1000;

     double flops=vol*Nvec*2;// mul,add
-    double bytes=3*vol*Nvec*sizeof(Real);
+    double bytes=3.0*vol*Nvec*sizeof(Real);
     std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

   }

@ -94,17 +94,17 @@ int main (int argc, char ** argv)
   (the second kernel receives the same treatment: the lat loop now runs 8..lmax in
   steps of 8, vol becomes int64_t, the commented-out RNG line gains its missing
   closing parenthesis, and the LatticeVec comments are retouched)

@ -119,7 +119,7 @@ int main (int argc, char ** argv)
     double flops=vol*Nvec*2;// mul,add
-    double bytes=3*vol*Nvec*sizeof(Real);
+    double bytes=3.0*vol*Nvec*sizeof(Real);
     std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

@ -129,20 +129,20 @@ int main (int argc, char ** argv)
   (same pattern for the third kernel, which uses RealD a=2.0)

@ -154,7 +154,7 @@ int main (int argc, char ** argv)
-    double bytes=2*vol*Nvec*sizeof(Real);
+    double bytes=2.0*vol*Nvec*sizeof(Real);
     double flops=vol*Nvec*1;// mul
     std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

@ -166,17 +166,17 @@ int main (int argc, char ** argv)
   (same pattern for the norm kernel, which also declares Real nn)

@ -187,7 +187,7 @@ int main (int argc, char ** argv)
-    double bytes=vol*Nvec*sizeof(Real);
+    double bytes=1.0*vol*Nvec*sizeof(Real);
     double flops=vol*Nvec*2;// mul,add
     std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
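These hunks raise lmax to 96, scale the iteration count as NLOOP = 10*lmax^4/vol so that every lattice size streams roughly the same total amount of data, and widen vol and the byte counts to 64-bit and floating-point arithmetic, since products such as 3*vol*Nvec*sizeof(Real) no longer fit a 32-bit int at the larger volumes. A small sketch of that scaling; Nvec = 8 and Real = double are assumptions here, not values taken from the source.

#include <cstdint>
#include <iostream>

int main() {
  const int64_t lmax = 96;
  const int64_t Nvec = 8;                                    // illustrative vector length
  for (int64_t lat = 8; lat <= lmax; lat += 8) {
    int64_t vol   = lat * lat * lat * lat;
    int64_t Nloop = 10 * lmax * lmax * lmax * lmax / vol;    // same total work at every lat
    int64_t bytes = 3 * vol * Nvec * (int64_t)sizeof(double);// per-iteration traffic of the axpy test
    std::cout << "lat " << lat << "  Nloop " << Nloop
              << "  bytes/iter " << bytes                    // exceeds INT32_MAX at the largest lat
              << "  total " << Nloop * bytes << std::endl;
  }
  return 0;
}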
@ -35,14 +35,14 @@ using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
-#define LMAX (32)
+#define LMAX (64)

-  int Nloop=200;
+  int64_t Nloop=20;

   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();

-  int threads = GridThread::GetThreads();
+  int64_t threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;

@ -54,16 +54,16 @@ int main (int argc, char ** argv)
   for(int lat=2;lat<=LMAX;lat+=2){

     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+    GridParallelRNG pRNG(&Grid);  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

-    LatticeColourMatrix z(&Grid);// random(pRNG,z);
-    LatticeColourMatrix x(&Grid);// random(pRNG,x);
-    LatticeColourMatrix y(&Grid);// random(pRNG,y);
+    LatticeColourMatrix z(&Grid); random(pRNG,z);
+    LatticeColourMatrix x(&Grid); random(pRNG,x);
+    LatticeColourMatrix y(&Grid); random(pRNG,y);

     double start=usecond();
-    for(int i=0;i<Nloop;i++){
+    for(int64_t i=0;i<Nloop;i++){
       x=x*y;
     }
     double stop=usecond();

@ -86,17 +86,17 @@ int main (int argc, char ** argv)
   (the z=x*y loop receives the same treatment: int64_t vol, a live RNG seeding x,
   y and z, and an int64_t loop counter)

@ -117,17 +117,17 @@ int main (int argc, char ** argv)
   (likewise for the mult(z,x,y) loop)

@ -148,17 +148,17 @@ int main (int argc, char ** argv)
   (likewise for the mac(z,x,y) loop)
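A rough bandwidth figure for the SU(3) kernels above follows from the fact that z = x*y streams three 3x3 complex matrices per site, two reads and one write. The sketch below makes that estimate under the assumption of double-precision complex numbers and a made-up sweep time; none of the numbers are measurements.

#include <complex>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t lat  = 64;                                            // LMAX after this change
  const int64_t vol  = lat * lat * lat * lat;
  const int64_t Nc   = 3;
  const int64_t site = 3 * Nc * Nc * (int64_t)sizeof(std::complex<double>); // two sources + one result
  const double  t_us = 5.0e5;                                         // assumed time for one sweep, microseconds
  double bytes = double(vol) * double(site);
  std::cout << "bytes/sweep " << bytes
            << "  GB/s " << bytes / t_us / 1.0e3 << std::endl;        // bytes/us is MB/s; /1e3 gives GB/s
  return 0;
}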
34  configure.ac
@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
|||||||
################ Get git info
|
################ Get git info
|
||||||
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
|
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
|
||||||
|
|
||||||
|
################ Set flags
|
||||||
|
# do not move!
|
||||||
|
CXXFLAGS="-O3 $CXXFLAGS"
|
||||||
|
|
||||||
############### Checks for programs
|
############### Checks for programs
|
||||||
AC_PROG_CXX
|
AC_PROG_CXX
|
||||||
AC_PROG_RANLIB
|
AC_PROG_RANLIB
|
||||||
@ -27,7 +31,6 @@ AX_GXX_VERSION
|
|||||||
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
|
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
|
||||||
[version of g++ that will compile the code])
|
[version of g++ that will compile the code])
|
||||||
|
|
||||||
CXXFLAGS="-O3 $CXXFLAGS"
|
|
||||||
|
|
||||||
|
|
||||||
############### Checks for typedefs, structures, and compiler characteristics
|
############### Checks for typedefs, structures, and compiler characteristics
|
||||||
@ -51,9 +54,14 @@ AC_CHECK_HEADERS(malloc/malloc.h)
|
|||||||
AC_CHECK_HEADERS(malloc.h)
|
AC_CHECK_HEADERS(malloc.h)
|
||||||
AC_CHECK_HEADERS(endian.h)
|
AC_CHECK_HEADERS(endian.h)
|
||||||
AC_CHECK_HEADERS(execinfo.h)
|
AC_CHECK_HEADERS(execinfo.h)
|
||||||
|
AC_CHECK_HEADERS(numaif.h)
|
||||||
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
||||||
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
||||||
|
|
||||||
|
############## Standard libraries
|
||||||
|
AC_CHECK_LIB([m],[cos])
|
||||||
|
AC_CHECK_LIB([stdc++],[abort])
|
||||||
|
|
||||||
############### GMP and MPFR
|
############### GMP and MPFR
|
||||||
AC_ARG_WITH([gmp],
|
AC_ARG_WITH([gmp],
|
||||||
[AS_HELP_STRING([--with-gmp=prefix],
|
[AS_HELP_STRING([--with-gmp=prefix],
|
||||||
@ -184,6 +192,15 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
|
|||||||
In order to use ILGG file format please install or provide the correct path to your installation
|
In order to use ILGG file format please install or provide the correct path to your installation
|
||||||
Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
|
Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
|
||||||
|
|
||||||
|
AC_SEARCH_LIBS([crc32], [z],
|
||||||
|
[AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
|
||||||
|
[have_zlib=true] [LIBS="${LIBS} -lz"],
|
||||||
|
[AC_MSG_ERROR(zlib library was not found in your system.)])
|
||||||
|
|
||||||
|
AC_SEARCH_LIBS([move_pages], [numa],
|
||||||
|
[AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
|
||||||
|
[have_libnuma=true] [LIBS="${LIBS} -lnuma"],
|
||||||
|
[AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
|
||||||
|
|
||||||
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
|
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
|
||||||
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
|
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
|
||||||
@ -237,6 +254,7 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
KNL)
|
KNL)
|
||||||
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
|
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
|
||||||
|
AC_DEFINE([KNL],[1],[Knights landing processor])
|
||||||
SIMD_FLAGS='-march=knl';;
|
SIMD_FLAGS='-march=knl';;
|
||||||
GEN)
|
GEN)
|
||||||
AC_DEFINE([GEN],[1],[generic vector code])
|
AC_DEFINE([GEN],[1],[generic vector code])
|
||||||
@ -244,6 +262,9 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
[generic SIMD vector width (in bytes)])
|
[generic SIMD vector width (in bytes)])
|
||||||
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
|
NEONv8)
|
||||||
|
AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
|
||||||
|
SIMD_FLAGS='-march=armv8-a';;
|
||||||
QPX|BGQ)
|
QPX|BGQ)
|
||||||
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
@ -272,6 +293,7 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
KNL)
|
KNL)
|
||||||
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
|
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
|
||||||
|
AC_DEFINE([KNL],[1],[Knights landing processor])
|
||||||
SIMD_FLAGS='-xmic-avx512';;
|
SIMD_FLAGS='-xmic-avx512';;
|
||||||
GEN)
|
GEN)
|
||||||
AC_DEFINE([GEN],[1],[generic vector code])
|
AC_DEFINE([GEN],[1],[generic vector code])
|
||||||
@ -320,14 +342,14 @@ case ${ac_COMMS} in
|
|||||||
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
|
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
|
||||||
comms_type='none'
|
comms_type='none'
|
||||||
;;
|
;;
|
||||||
mpi3l*)
|
|
||||||
AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
|
|
||||||
comms_type='mpi3l'
|
|
||||||
;;
|
|
||||||
mpi3*)
|
mpi3*)
|
||||||
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
|
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
|
||||||
comms_type='mpi3'
|
comms_type='mpi3'
|
||||||
;;
|
;;
|
||||||
|
mpit)
|
||||||
|
AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
|
||||||
|
comms_type='mpit'
|
||||||
|
;;
|
||||||
mpi*)
|
mpi*)
|
||||||
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
|
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
|
||||||
comms_type='mpi'
|
comms_type='mpi'
|
||||||
@ -355,7 +377,7 @@ esac
|
|||||||
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
|
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
|
||||||
AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
|
AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
|
||||||
AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
|
AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
|
||||||
AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
|
AM_CONDITIONAL(BUILD_COMMS_MPIT, [ test "${comms_type}X" == "mpitX" ] )
|
||||||
AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
|
AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
|
||||||
|
|
||||||
############### RNG selection
|
############### RNG selection
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
|
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
|
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
|
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
|
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
|
||||||
@ -16,10 +17,10 @@
|
|||||||
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
|
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
|
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
|
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
|
||||||
|
#include <Grid/Hadrons/Modules/MScalar/ScalarVP.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSink/Point.hpp>
|
#include <Grid/Hadrons/Modules/MSink/Point.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
|
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
|
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
|
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
|
#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
|
||||||
#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
|
#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
|
||||||
#include <Grid/Hadrons/Modules/Quark.hpp>
|
|
||||||
|
@ -1,34 +1,5 @@
|
|||||||
/*************************************************************************************
|
#ifndef Hadrons_MFermion_GaugeProp_hpp_
|
||||||
|
#define Hadrons_MFermion_GaugeProp_hpp_
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: extras/Hadrons/Modules/Quark.hpp
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
Copyright (C) 2016
|
|
||||||
|
|
||||||
Author: Antonin Portelli <antonin.portelli@me.com>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
|
|
||||||
#ifndef Hadrons_Quark_hpp_
|
|
||||||
#define Hadrons_Quark_hpp_
|
|
||||||
|
|
||||||
#include <Grid/Hadrons/Global.hpp>
|
#include <Grid/Hadrons/Global.hpp>
|
||||||
#include <Grid/Hadrons/Module.hpp>
|
#include <Grid/Hadrons/Module.hpp>
|
||||||
@@ -37,27 +8,29 @@ See the full license in the file "LICENSE" in the top level distribution directory

 BEGIN_HADRONS_NAMESPACE

 /******************************************************************************
- *                                TQuark                                      *
+ *                               GaugeProp                                    *
  ******************************************************************************/
-class QuarkPar: Serializable
+BEGIN_MODULE_NAMESPACE(MFermion)
+
+class GaugePropPar: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar,
+    GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar,
                                     std::string, source,
                                     std::string, solver);
 };

 template <typename FImpl>
-class TQuark: public Module<QuarkPar>
+class TGaugeProp: public Module<GaugePropPar>
 {
 public:
     FGS_TYPE_ALIASES(FImpl,);
 public:
     // constructor
-    TQuark(const std::string name);
+    TGaugeProp(const std::string name);
     // destructor
-    virtual ~TQuark(void) = default;
-    // dependencies/products
+    virtual ~TGaugeProp(void) = default;
+    // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
     // setup
@@ -69,20 +42,20 @@ private:
     SolverFn *solver_{nullptr};
 };

-MODULE_REGISTER(Quark, TQuark<FIMPL>);
+MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);

 /******************************************************************************
- *                          TQuark implementation                             *
+ *                        TGaugeProp implementation                           *
  ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
-TQuark<FImpl>::TQuark(const std::string name)
-: Module(name)
+TGaugeProp<FImpl>::TGaugeProp(const std::string name)
+: Module<GaugePropPar>(name)
 {}

 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl>
-std::vector<std::string> TQuark<FImpl>::getInput(void)
+std::vector<std::string> TGaugeProp<FImpl>::getInput(void)
 {
     std::vector<std::string> in = {par().source, par().solver};

@@ -90,7 +63,7 @@ std::vector<std::string> TQuark<FImpl>::getInput(void)
 }

 template <typename FImpl>
-std::vector<std::string> TQuark<FImpl>::getOutput(void)
+std::vector<std::string> TGaugeProp<FImpl>::getOutput(void)
 {
     std::vector<std::string> out = {getName(), getName() + "_5d"};

@@ -99,7 +72,7 @@ std::vector<std::string> TQuark<FImpl>::getOutput(void)

 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TQuark<FImpl>::setup(void)
+void TGaugeProp<FImpl>::setup(void)
 {
     Ls_ = env().getObjectLs(par().solver);
     env().template registerLattice<PropagatorField>(getName());
@@ -111,13 +84,13 @@ void TQuark<FImpl>::setup(void)

 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TQuark<FImpl>::execute(void)
+void TGaugeProp<FImpl>::execute(void)
 {
     LOG(Message) << "Computing quark propagator '" << getName() << "'"
                  << std::endl;

     FermionField    source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)),
                     tmp(env().getGrid());
     std::string     propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
     PropagatorField &prop    = *env().template createLattice<PropagatorField>(propName);
     PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);
@@ -128,12 +101,12 @@ void TQuark<FImpl>::execute(void)
     }

     LOG(Message) << "Inverting using solver '" << par().solver
                  << "' on source '" << par().source << "'" << std::endl;
     for (unsigned int s = 0; s < Ns; ++s)
     for (unsigned int c = 0; c < Nc; ++c)
     {
         LOG(Message) << "Inversion for spin= " << s << ", color= " << c
                      << std::endl;
         // source conversion for 4D sources
         if (!env().isObject5d(par().source))
         {
@@ -170,7 +143,7 @@ void TQuark<FImpl>::execute(void)
         if (Ls_ > 1)
         {
             PropagatorField &p4d =
                 *env().template getObject<PropagatorField>(getName());

             axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
             axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
@@ -180,6 +153,8 @@ void TQuark<FImpl>::execute(void)
     }
 }

+END_MODULE_NAMESPACE

 END_HADRONS_NAMESPACE

-#endif // Hadrons_Quark_hpp_
+#endif // Hadrons_MFermion_GaugeProp_hpp_
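For orientation, here is a minimal sketch (not part of this commit) of how the renamed module could be wired into a Hadrons application. The module, source and solver names ("Qpt_l", "pt", "CG_l") and the omitted gauge/action/solver/source setup are hypothetical placeholders; only the parameter structure and the createModule call mirror the code above.

// Hypothetical driver sketch for MFermion::GaugeProp; assumes the usual
// Hadrons application boilerplate and that "pt" and "CG_l" modules exist.
#include <Grid/Hadrons/Application.hpp>

using namespace Grid;
using namespace Hadrons;

int main(int argc, char *argv[])
{
    Grid_init(&argc, &argv);

    Application application;
    // ... global parameters, gauge field, action, solver ("CG_l") and
    //     source ("pt") modules would be created here ...

    MFermion::GaugeProp::Par quarkPar;
    quarkPar.solver = "CG_l";   // assumed solver module name
    quarkPar.source = "pt";     // assumed source module name
    application.createModule<MFermion::GaugeProp>("Qpt_l", quarkPar);

    application.run();
    Grid_finalize();

    return 0;
}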
@@ -65,7 +65,7 @@ void TLoad::setup(void)
 // execution ///////////////////////////////////////////////////////////////////
 void TLoad::execute(void)
 {
-    NerscField header;
+    FieldMetaData header;
     std::string fileName = par().file + "."
                            + std::to_string(env().getTrajectory());

@@ -74,5 +74,5 @@ void TLoad::execute(void)
     LatticeGaugeField &U = *env().createLattice<LatticeGaugeField>(getName());
     NerscIO::readConfiguration(U, header, fileName);
     LOG(Message) << "NERSC header:" << std::endl;
-    dump_nersc_header(header, LOG(Message));
+    dump_meta_data(header, LOG(Message));
 }
@@ -23,7 +23,8 @@ std::vector<std::string> TChargedProp::getInput(void)

 std::vector<std::string> TChargedProp::getOutput(void)
 {
-    std::vector<std::string> out = {getName()};
+    std::vector<std::string> out = {getName(), getName()+"_Q",
+                                    getName()+"_Sun", getName()+"_Tad"};

     return out;
 }
@@ -38,6 +39,10 @@ void TChargedProp::setup(void)
         phaseName_.push_back("_shiftphase_" + std::to_string(mu));
     }
     GFSrcName_ = "_" + getName() + "_DinvSrc";
+    prop0Name_   = getName() + "_0";
+    propQName_   = getName() + "_Q";
+    propSunName_ = getName() + "_Sun";
+    propTadName_ = getName() + "_Tad";
     if (!env().hasRegisteredObject(freeMomPropName_))
     {
         env().registerLattice<ScalarField>(freeMomPropName_);
@@ -53,7 +58,14 @@ void TChargedProp::setup(void)
     {
         env().registerLattice<ScalarField>(GFSrcName_);
     }
+    if (!env().hasRegisteredObject(prop0Name_))
+    {
+        env().registerLattice<ScalarField>(prop0Name_);
+    }
     env().registerLattice<ScalarField>(getName());
+    env().registerLattice<ScalarField>(propQName_);
+    env().registerLattice<ScalarField>(propSunName_);
+    env().registerLattice<ScalarField>(propTadName_);
 }

 // execution ///////////////////////////////////////////////////////////////////
@@ -64,7 +76,7 @@ void TChargedProp::execute(void)
     Complex ci(0.0,1.0);
     FFT     fft(env().getGrid());

-    // cache free scalar propagator
+    // cache momentum-space free scalar propagator
     if (!env().hasCreatedObject(freeMomPropName_))
     {
         LOG(Message) << "Caching momentum space free scalar propagator"
@@ -88,6 +100,17 @@ void TChargedProp::execute(void)
     {
         GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
     }
+    // cache position-space free scalar propagator
+    if (!env().hasCreatedObject(prop0Name_))
+    {
+        prop0_  = env().createLattice<ScalarField>(prop0Name_);
+        *prop0_ = *GFSrc_;
+        fft.FFT_all_dim(*prop0_, *prop0_, FFT::backward);
+    }
+    else
+    {
+        prop0_ = env().getObject<ScalarField>(prop0Name_);
+    }
     // cache phases
     if (!env().hasCreatedObject(phaseName_[0]))
     {
@@ -117,52 +140,137 @@ void TChargedProp::execute(void)
                  << ", charge= " << par().charge << ")..." << std::endl;

     ScalarField &prop    = *env().createLattice<ScalarField>(getName());
+    ScalarField &propQ   = *env().createLattice<ScalarField>(propQName_);
+    ScalarField &propSun = *env().createLattice<ScalarField>(propSunName_);
+    ScalarField &propTad = *env().createLattice<ScalarField>(propTadName_);
     ScalarField buf(env().getGrid());
     ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_;
     double      q = par().charge;

-    // G*F*Src
-    prop = GFSrc;
-
-    // - q*G*momD1*G*F*Src (momD1 = F*D1*Finv)
+    // -G*momD1*G*F*Src (momD1 = F*D1*Finv)
     buf = GFSrc;
     momD1(buf, fft);
-    buf  = G*buf;
-    prop = prop - q*buf;
+    buf = -G*buf;
+    fft.FFT_all_dim(propQ, buf, FFT::backward);

-    // + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
+    // G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
+    buf = -buf;
     momD1(buf, fft);
-    prop = prop + q*q*G*buf;
+    propSun = G*buf;
+    fft.FFT_all_dim(propSun, propSun, FFT::backward);

-    // - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv)
+    // -G*momD2*G*F*Src (momD2 = F*D2*Finv)
     buf = GFSrc;
     momD2(buf, fft);
-    prop = prop - q*q*G*buf;
+    buf = -G*buf;
+    fft.FFT_all_dim(propTad, buf, FFT::backward);

-    // final FT
-    fft.FFT_all_dim(prop, prop, FFT::backward);
+    // full charged scalar propagator
+    prop = (*prop0_) + q*propQ + q*q*propSun + q*q*propTad;

     // OUTPUT IF NECESSARY
     if (!par().output.empty())
     {
-        std::string filename = par().output + "." +
-                               std::to_string(env().getTrajectory());
-
-        LOG(Message) << "Saving zero-momentum projection to '"
-                     << filename << "'..." << std::endl;
-
-        CorrWriter            writer(filename);
-        std::vector<TComplex> vecBuf;
-        std::vector<Complex>  result;
-
-        sliceSum(prop, vecBuf, Tp);
-        result.resize(vecBuf.size());
-        for (unsigned int t = 0; t < vecBuf.size(); ++t)
-        {
-            result[t] = TensorRemove(vecBuf[t]);
-        }
-        write(writer, "charge", q);
-        write(writer, "prop", result);
+        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+        {
+            std::vector<int> mom = strToVec<int>(par().outputMom[i_p]);
+            std::string filename = par().output + "_" + std::to_string(mom[0])
+                                                      + std::to_string(mom[1])
+                                                      + std::to_string(mom[2])
+                                                      + "." +
+                                   std::to_string(env().getTrajectory());
+
+            LOG(Message) << "Saving (" << par().outputMom[i_p] << ") momentum projection to '"
+                         << filename << "'..." << std::endl;
+
+            CorrWriter            writer(filename);
+            std::vector<TComplex> vecBuf;
+            std::vector<Complex>  result;
+
+            write(writer, "charge", q);
+            write(writer, "mass", par().mass);
+
+            // Write full propagator
+            buf = prop;
+            for (unsigned int j = 0; j < env().getNd()-1; ++j)
+            {
+                for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
+                {
+                    buf = buf*adj(*phase_[j]);
+                }
+            }
+            sliceSum(buf, vecBuf, Tp);
+            result.resize(vecBuf.size());
+            for (unsigned int t = 0; t < vecBuf.size(); ++t)
+            {
+                result[t] = TensorRemove(vecBuf[t]);
+            }
+            write(writer, "prop", result);
+
+            // The free propagator *prop0_, the O(q) term propQ, the sunset
+            // term propSun and the tadpole term propTad are projected onto the
+            // same momentum and time-slice summed in exactly the same way, and
+            // written as "prop_0", "prop_Q", "prop_Sun" and "prop_Tad".
+        }
     }
 }
@@ -20,7 +20,8 @@ public:
                                     std::string, source,
                                     double,      mass,
                                     double,      charge,
-                                    std::string, output);
+                                    std::string, output,
+                                    std::vector<std::string>, outputMom);
 };

 class TChargedProp: public Module<ChargedPropPar>
@@ -45,9 +46,10 @@ private:
     void momD1(ScalarField &s, FFT &fft);
     void momD2(ScalarField &s, FFT &fft);
 private:
-    std::string                freeMomPropName_, GFSrcName_;
+    std::string                freeMomPropName_, GFSrcName_, prop0Name_,
+                               propQName_, propSunName_, propTadName_;
     std::vector<std::string>   phaseName_;
-    ScalarField                *freeMomProp_, *GFSrc_;
+    ScalarField                *freeMomProp_, *GFSrc_, *prop0_;
     std::vector<ScalarField *> phase_;
     EmField                    *A;
 };
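In operator form, the expansion that the new ChargedProp code caches term by term is, as a sketch using the notation of the code comments above (G the free momentum-space propagator, F the Fourier transform, Src the source, momD1/momD2 the one- and two-photon insertions):

\begin{align}
  \tilde\Phi &= G\,F\,\mathrm{Src}
      \;-\; q\,G\,\mathrm{momD1}\,G\,F\,\mathrm{Src}
      \;+\; q^2\,G\,\mathrm{momD1}\,G\,\mathrm{momD1}\,G\,F\,\mathrm{Src}
      \;-\; q^2\,G\,\mathrm{momD2}\,G\,F\,\mathrm{Src}
      \;+\; \mathcal{O}(q^3), \\
  \Phi &= F^{-1}\tilde\Phi
      \;=\; \Phi_0 + q\,\Phi_Q + q^2\,\Phi_{\mathrm{Sun}} + q^2\,\Phi_{\mathrm{Tad}},
\end{align}

where $\Phi_0$, $\Phi_Q$, $\Phi_{\mathrm{Sun}}$ and $\Phi_{\mathrm{Tad}}$ are the position-space fields stored under the "_0", "_Q", "_Sun" and "_Tad" names and recombined in prop above.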
588  extras/Hadrons/Modules/MScalar/ScalarVP.cc  (new file)
@@ -0,0 +1,588 @@
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/ScalarVP.hpp>
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>

using namespace Grid;
using namespace Hadrons;
using namespace MScalar;

/******************************************************************************
 *                       TScalarVP implementation                            *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TScalarVP::TScalarVP(const std::string name)
: Module<ScalarVPPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TScalarVP::getInput(void)
{
    propQName_   = par().scalarProp + "_Q";
    propSunName_ = par().scalarProp + "_Sun";
    propTadName_ = par().scalarProp + "_Tad";

    std::vector<std::string> in = {par().emField, propQName_, propSunName_,
                                   propTadName_};

    return in;
}

std::vector<std::string> TScalarVP::getOutput(void)
{
    std::vector<std::string> out;

    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        out.push_back(getName() + "_propQ_" + std::to_string(mu));

        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
        {
            out.push_back(getName() + "_" + std::to_string(mu) + "_" + std::to_string(nu));
            out.push_back(getName() + "_free_" + std::to_string(mu) + "_" + std::to_string(nu));
        }
    }

    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
void TScalarVP::setup(void)
{
    freeMomPropName_ = FREEMOMPROP(static_cast<TChargedProp *>(env().getModule(par().scalarProp))->par().mass);
    GFSrcName_       = "_" + par().scalarProp + "_DinvSrc";
    prop0Name_       = par().scalarProp + "_0";

    phaseName_.clear();
    muPropQName_.clear();
    vpTensorName_.clear();
    freeVpTensorName_.clear();

    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
        muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu));

        std::vector<std::string> vpTensorName_mu;
        std::vector<std::string> freeVpTensorName_mu;
        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
        {
            vpTensorName_mu.push_back(getName() + "_" + std::to_string(mu)
                                      + "_" + std::to_string(nu));
            freeVpTensorName_mu.push_back(getName() + "_free_" + std::to_string(mu)
                                          + "_" + std::to_string(nu));
        }
        vpTensorName_.push_back(vpTensorName_mu);
        freeVpTensorName_.push_back(freeVpTensorName_mu);
    }

    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        env().registerLattice<ScalarField>(muPropQName_[mu]);

        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
        {
            env().registerLattice<ScalarField>(vpTensorName_[mu][nu]);
            env().registerLattice<ScalarField>(freeVpTensorName_[mu][nu]);
        }
    }
}

// execution ///////////////////////////////////////////////////////////////////
void TScalarVP::execute(void)
{
    // Get objects cached by ChargedProp module
    Complex ci(0.0,1.0);
    FFT     fft(env().getGrid());
    Real    q = static_cast<TChargedProp *>(env().getModule(par().scalarProp))->par().charge;

    freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_);
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        phase_.push_back(env().getObject<ScalarField>(phaseName_[mu]));
    }
    GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
    prop0_ = env().getObject<ScalarField>(prop0Name_);

    // Propagator from unshifted source
    ScalarField &propQ   = *env().getObject<ScalarField>(propQName_);
    ScalarField &propSun = *env().getObject<ScalarField>(propSunName_);
    ScalarField &propTad = *env().getObject<ScalarField>(propTadName_);

    // Propagators from shifted sources
    LOG(Message) << "Computing O(q) charged scalar propagators..."
                 << std::endl;
    std::vector<ScalarField> muPropQ;
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        muPropQ.push_back(*env().createLattice<ScalarField>(muPropQName_[mu]));

        // -G*momD1*G*F*tau_mu*Src (momD1 = F*D1*Finv)
        muPropQ[mu] = adj(*phase_[mu])*(*GFSrc_);
        momD1(muPropQ[mu], fft);
        muPropQ[mu] = -(*freeMomProp_)*muPropQ[mu];
        fft.FFT_all_dim(muPropQ[mu], muPropQ[mu], FFT::backward);
    }

    // CONTRACTIONS
    ScalarField prop1(env().getGrid()), prop2(env().getGrid());
    EmField     &A = *env().getObject<EmField>(par().emField);
    ScalarField Amu(env().getGrid()), tmp_vp(env().getGrid());
    TComplex    Anu0;
    std::vector<int> coor0 = {0, 0, 0, 0};
    std::vector<std::vector<ScalarField> > vpTensor, freeVpTensor;
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        std::vector<ScalarField> vpTensor_mu;
        std::vector<ScalarField> freeVpTensor_mu;
        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
        {
            vpTensor_mu.push_back(*env().createLattice<ScalarField>(vpTensorName_[mu][nu]));
            freeVpTensor_mu.push_back(*env().createLattice<ScalarField>(freeVpTensorName_[mu][nu]));
        }
        vpTensor.push_back(vpTensor_mu);
        freeVpTensor.push_back(freeVpTensor_mu);
    }

    // Open output files if necessary
    std::vector<CorrWriter *> writer, writer0, writerD;
    if (!par().output.empty())
    {
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            std::vector<int> mom = strToVec<int>(par().outputMom[i_p]);

            std::string filename  = par().output + "_" + std::to_string(mom[0])
                                    + std::to_string(mom[1]) + std::to_string(mom[2])
                                    + "." + std::to_string(env().getTrajectory());
            std::string filename0 = par().output + "_" + std::to_string(mom[0])
                                    + std::to_string(mom[1]) + std::to_string(mom[2])
                                    + "_free." + std::to_string(env().getTrajectory());
            std::string filenameD = par().output + "_" + std::to_string(mom[0])
                                    + std::to_string(mom[1]) + std::to_string(mom[2])
                                    + "_diagrams." + std::to_string(env().getTrajectory());

            CorrWriter *writer_i  = new CorrWriter(filename);
            writer.push_back(writer_i);
            CorrWriter *writer0_i = new CorrWriter(filename0);
            writer0.push_back(writer0_i);
            CorrWriter *writerD_i = new CorrWriter(filenameD);
            writerD.push_back(writerD_i);

            write(*writer[i_p],  "charge", q);
            write(*writer[i_p],  "mass",   static_cast<TChargedProp *>(env().getModule(par().scalarProp))->par().mass);
            write(*writer0[i_p], "charge", 0.0);
            write(*writer0[i_p], "mass",   static_cast<TChargedProp *>(env().getModule(par().scalarProp))->par().mass);
            write(*writerD[i_p], "charge", q);
            write(*writerD[i_p], "mass",   static_cast<TChargedProp *>(env().getModule(par().scalarProp))->par().mass);
        }
    }
    std::vector<TComplex> vecBuf;
    std::vector<Complex>  result;
    ScalarField vpPhase(env().getGrid());

    // Do contractions
    for (unsigned int nu = 0; nu < env().getNd(); ++nu)
    {
        peekSite(Anu0, peekLorentz(A, nu), coor0);

        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..."
                         << std::endl;
            Amu = peekLorentz(A, mu);

            // Free VP
            prop1 = *prop0_;
            prop2 = Cshift(*prop0_, nu, -1);
            freeVpTensor[mu][nu]  = adj(prop2) * Cshift(prop1, mu, 1);
            freeVpTensor[mu][nu] -= Cshift(adj(prop2), mu, 1) * prop1;
            freeVpTensor[mu][nu]  = 2.0*real(freeVpTensor[mu][nu]);

            // Output if necessary
            if (!par().output.empty())
            {
                std::vector<int> mom;
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    mom     = strToVec<int>(par().outputMom[i_p]);
                    vpPhase = freeVpTensor[mu][nu];
                    for (unsigned int j = 0; j < env().getNd()-1; ++j)
                    {
                        for (unsigned int momcount = 0; momcount < mom[j]; ++momcount)
                        {
                            vpPhase = vpPhase*adj(*phase_[j]);
                        }
                    }
                    sliceSum(vpPhase, vecBuf, Tp);
                    result.resize(vecBuf.size());
                    for (unsigned int t = 0; t < vecBuf.size(); ++t)
                    {
                        result[t] = TensorRemove(vecBuf[t]);
                    }
                    write(*writer0[i_p],
                          "Pi_"+std::to_string(mu)+"_"+std::to_string(nu),
                          result);
                }
            }

            // "Exchange" terms
            prop1 += q*propQ;
            prop2 += q*muPropQ[nu];
            tmp_vp  = adj(prop2) * (1.0 + ci*q*Amu)
                      * Cshift(prop1, mu, 1) * (1.0 + ci*q*Anu0);
            tmp_vp -= Cshift(adj(prop2), mu, 1) * (1.0 - ci*q*Amu)
                      * prop1 * (1.0 + ci*q*Anu0);
            tmp_vp = 2.0*real(tmp_vp);
            vpTensor[mu][nu] = tmp_vp;
            // tmp_vp is momentum-projected and time-slice summed exactly as in
            // the free-VP block above and written to writerD as
            // "Pi_exchange_<mu>_<nu>".

            // Subtract O(alpha^2) term
            prop1 = q*propQ;
            prop2 = q*muPropQ[nu];
            tmp_vp  = Cshift(adj(prop2), mu, 1) * (-ci)*q*Amu
                      * prop1 * ci*q*Anu0;
            tmp_vp -= adj(prop2) * ci*q*Amu
                      * Cshift(prop1, mu, 1) * ci*q*Anu0;
            tmp_vp = 2.0*real(tmp_vp);
            vpTensor[mu][nu] += tmp_vp;
            // same projection, written to writerD as "Pi_alpha2_<mu>_<nu>"

            // Sunset from unshifted source
            prop1 = q*q*propSun;
            prop2 = Cshift(*prop0_, nu, -1);
            tmp_vp  = adj(prop2) * Cshift(prop1, mu, 1);
            tmp_vp -= Cshift(adj(prop2), mu, 1) * prop1;
            tmp_vp = 2.0*real(tmp_vp);
            vpTensor[mu][nu] += tmp_vp;
            // same projection, written to writerD as "Pi_sunset_unshifted_<mu>_<nu>"

            // Sunset from shifted source
            prop1 = Cshift(prop1, nu, -1);
            tmp_vp  = Cshift(adj(*prop0_), mu, 1) * prop1;
            tmp_vp -= adj(*prop0_) * Cshift(prop1, mu, 1);
            tmp_vp = 2.0*real(tmp_vp);
            vpTensor[mu][nu] += tmp_vp;
            // same projection, written to writerD as "Pi_sunset_shifted_<mu>_<nu>"

            // Tadpole from unshifted source
            prop1 = q*q*propTad;
            prop2 = Cshift(*prop0_, nu, -1);
            tmp_vp  = adj(prop2) * Cshift(prop1, mu, 1);
            tmp_vp -= Cshift(adj(prop2), mu, 1) * prop1;
            tmp_vp = 2.0*real(tmp_vp);
            vpTensor[mu][nu] += tmp_vp;
            // same projection, written to writerD as "Pi_tadpole_unshifted_<mu>_<nu>"

            // Tadpole from shifted source
            prop1 = Cshift(prop1, nu, -1);
            tmp_vp  = Cshift(adj(*prop0_), mu, 1) * prop1;
            tmp_vp -= adj(*prop0_) * Cshift(prop1, mu, 1);
            tmp_vp = 2.0*real(tmp_vp);
            vpTensor[mu][nu] += tmp_vp;
            // same projection, written to writerD as "Pi_tadpole_shifted_<mu>_<nu>"

            // Source tadpole
            prop1 = *prop0_;
            tmp_vp  = adj(prop2)
                      * Cshift(prop1, mu, 1)
                      * (-0.5)*q*q*Anu0*Anu0;
            tmp_vp -= Cshift(adj(prop2), mu, 1)
                      * prop1
                      * (-0.5)*q*q*Anu0*Anu0;
            tmp_vp = 2.0*real(tmp_vp);
            vpTensor[mu][nu] += tmp_vp;
            // same projection, written to writerD as "Pi_sourcetadpole_<mu>_<nu>"

            // Sink tadpole
            tmp_vp  = adj(prop2)
                      * (-0.5)*q*q*Amu*Amu
                      * Cshift(prop1, mu, 1);
            tmp_vp -= Cshift(adj(prop2), mu, 1)
                      * (-0.5)*q*q*Amu*Amu
                      * prop1;
            tmp_vp = 2.0*real(tmp_vp);
            vpTensor[mu][nu] += tmp_vp;
            // same projection, written to writerD as "Pi_sinktadpole_<mu>_<nu>"

            // Finally the accumulated vpTensor[mu][nu] is projected the same way
            // and written to writer as "Pi_<mu>_<nu>".
        }
    }
    if (!par().output.empty())
    {
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            delete writer[i_p];
            delete writer0[i_p];
            delete writerD[i_p];
        }
    }
}

void TScalarVP::momD1(ScalarField &s, FFT &fft)
{
    EmField     &A = *env().getObject<EmField>(par().emField);
    ScalarField buf(env().getGrid()), result(env().getGrid()),
                Amu(env().getGrid());
    Complex     ci(0.0,1.0);

    result = zero;

    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Amu = peekLorentz(A, mu);
        buf = (*phase_[mu])*s;
        fft.FFT_all_dim(buf, buf, FFT::backward);
        buf = Amu*buf;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result - ci*buf;
    }
    fft.FFT_all_dim(s, s, FFT::backward);
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Amu = peekLorentz(A, mu);
        buf = Amu*s;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result + ci*adj(*phase_[mu])*buf;
    }

    s = result;
}

64  extras/Hadrons/Modules/MScalar/ScalarVP.hpp  (new file)
@@ -0,0 +1,64 @@
#ifndef Hadrons_MScalar_ScalarVP_hpp_
#define Hadrons_MScalar_ScalarVP_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                              ScalarVP                                      *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalar)

class ScalarVPPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarVPPar,
                                    std::string, emField,
                                    std::string, scalarProp,
                                    std::string, output,
                                    std::vector<std::string>, outputMom);
};

class TScalarVP: public Module<ScalarVPPar>
{
public:
    SCALAR_TYPE_ALIASES(SIMPL,);
    typedef PhotonR::GaugeField     EmField;
    typedef PhotonR::GaugeLinkField EmComp;
public:
    // constructor
    TScalarVP(const std::string name);
    // destructor
    virtual ~TScalarVP(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
private:
    void momD1(ScalarField &s, FFT &fft);
private:
    std::string                            freeMomPropName_, GFSrcName_,
                                           prop0Name_, propQName_,
                                           propSunName_, propTadName_;
    std::vector<std::string>               phaseName_, muPropQName_;
    std::vector<std::vector<std::string> > vpTensorName_,
                                           freeVpTensorName_;
    ScalarField                            *freeMomProp_, *GFSrc_,
                                           *prop0_;
    std::vector<ScalarField *>             phase_;
    EmField                                *A;
};

MODULE_REGISTER_NS(ScalarVP, TScalarVP, MScalar);

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MScalar_ScalarVP_hpp_
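As a sketch of what the free part of the ScalarVP contraction above computes, assuming Grid's Cshift(f, mu, 1) returns the field whose value at x is f(x+mu-hat) and writing phi for the free position-space propagator prop0:

\begin{equation}
  \Pi^{\mathrm{free}}_{\mu\nu}(x)
    \;=\; 2\,\mathrm{Re}\!\left[
      \phi^{*}(x-\hat\nu)\,\phi(x+\hat\mu)
      \;-\; \phi^{*}(x+\hat\mu-\hat\nu)\,\phi(x)
    \right],
\end{equation}

which is what freeVpTensor[mu][nu] holds before the momentum projection; the charged tensor vpTensor[mu][nu] adds the exchange, O(alpha^2) subtraction, sunset and tadpole pieces listed in the code.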
@@ -7,7 +7,8 @@ modules_cc =\
   Modules/MGauge/StochEm.cc \
   Modules/MGauge/Unit.cc \
   Modules/MScalar/ChargedProp.cc \
-  Modules/MScalar/FreeProp.cc
+  Modules/MScalar/FreeProp.cc \
+  Modules/MScalar/ScalarVP.cc

 modules_hpp =\
   Modules/MAction/DWF.hpp \
@@ -20,6 +21,7 @@ modules_hpp =\
   Modules/MContraction/WeakHamiltonianEye.hpp \
   Modules/MContraction/WeakHamiltonianNonEye.hpp \
   Modules/MContraction/WeakNeutral4ptDisc.hpp \
+  Modules/MFermion/GaugeProp.hpp \
   Modules/MGauge/Load.hpp \
   Modules/MGauge/Random.hpp \
   Modules/MGauge/StochEm.hpp \
@@ -28,11 +30,11 @@ modules_hpp =\
   Modules/MScalar/ChargedProp.hpp \
   Modules/MScalar/FreeProp.hpp \
   Modules/MScalar/Scalar.hpp \
+  Modules/MScalar/ScalarVP.hpp \
   Modules/MSink/Point.hpp \
   Modules/MSolver/RBPrecCG.hpp \
   Modules/MSource/Point.hpp \
   Modules/MSource/SeqGamma.hpp \
   Modules/MSource/Wall.hpp \
-  Modules/MSource/Z2.hpp \
-  Modules/Quark.hpp
+  Modules/MSource/Z2.hpp
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
#include <Grid/GridQCDcore.h>
|
#include <Grid/GridQCDcore.h>
|
||||||
#include <Grid/qcd/action/Action.h>
|
#include <Grid/qcd/action/Action.h>
|
||||||
|
#include <Grid/qcd/utils/GaugeFix.h>
|
||||||
#include <Grid/qcd/smearing/Smearing.h>
|
#include <Grid/qcd/smearing/Smearing.h>
|
||||||
|
#include <Grid/parallelIO/MetaData.h>
|
||||||
#include <Grid/qcd/hmc/HMC_aggregate.h>
|
#include <Grid/qcd/hmc/HMC_aggregate.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -7,6 +7,7 @@
 #include <cassert>
 #include <complex>
 #include <vector>
+#include <string>
 #include <iostream>
 #include <iomanip>
 #include <random>
@@ -18,6 +19,7 @@
 #include <ctime>
 #include <sys/time.h>
 #include <chrono>
+#include <zlib.h>

 ///////////////////
 // Grid config
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
   extra_sources+=communicator/Communicator_base.cc
 endif

-if BUILD_COMMS_MPI3L
-  extra_sources+=communicator/Communicator_mpi3_leader.cc
+if BUILD_COMMS_MPIT
+  extra_sources+=communicator/Communicator_mpit.cc
   extra_sources+=communicator/Communicator_base.cc
 endif
@@ -1,137 +0,0 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/algorithms/iterative/DenseMatrix.h

    Copyright (C) 2015

    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DENSE_MATRIX_H
#define GRID_DENSE_MATRIX_H

namespace Grid {
    /////////////////////////////////////////////////////////////
    // Matrix utils
    /////////////////////////////////////////////////////////////

    template<class T> using DenseVector = std::vector<T>;
    template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;

    template<class T> void Size(DenseVector<T> & vec, int &N)
    {
        N = vec.size();
    }
    template<class T> void Size(DenseMatrix<T> & mat, int &N, int &M)
    {
        N = mat.size();
        M = mat[0].size();
    }

    template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N)
    {
        int M; Size(mat,N,M);
        assert(N==M);
    }

    template<class T> void Resize(DenseVector<T> & mat, int N) {
        mat.resize(N);
    }
    template<class T> void Resize(DenseMatrix<T> & mat, int N, int M) {
        mat.resize(N);
        for(int i=0;i<N;i++){
            mat[i].resize(M);
        }
    }
    template<class T> void Fill(DenseMatrix<T> & mat, T &val) {
        int N,M;
        Size(mat,N,M);
        for(int i=0;i<N;i++){
        for(int j=0;j<M;j++){
            mat[i][j] = val;
        }}
    }

    /** Transpose of a matrix **/
    template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
        int N,M;
        Size(mat,N,M);
        DenseMatrix<T> C; Resize(C,M,N);
        for(int i=0;i<M;i++){
        for(int j=0;j<N;j++){
            C[i][j] = mat[j][i];
        }}
        return C;
    }

    /** Set DenseMatrix to unit matrix **/
    template<class T> void Unity(DenseMatrix<T> &A){
        int N; SizeSquare(A,N);
        for(int i=0;i<N;i++){
            for(int j=0;j<N;j++){
                if ( i==j ) A[i][j] = 1;
                else        A[i][j] = 0;
            }
        }
    }

    /** Add c * I to matrix **/
    template<class T>
    void PlusUnit(DenseMatrix<T> & A, T c){
        int dim; SizeSquare(A,dim);
        for(int i=0;i<dim;i++){ A[i][i] = A[i][i] + c; }
    }

    /** Return the Hermitian conjugate of matrix **/
    template<class T>
    DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){

        int dim; SizeSquare(mat,dim);

        DenseMatrix<T> C; Resize(C,dim,dim);

        for(int i=0;i<dim;i++){
            for(int j=0;j<dim;j++){
                C[i][j] = conj(mat[j][i]);
            }
        }
        return C;
    }

    /** Get a square submatrix **/
    template <class T>
    DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A, int row_st, int row_end, int col_st, int col_end)
    {
        DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);

        for(int i = row_st; i<row_end; i++){
        for(int j = col_st; j<col_end; j++){
            H[i-row_st][j-col_st]=A[i][j];
        }}
        return H;
    }

}

#include "Householder.h"
#include "Francis.h"

#endif
@ -1,525 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/algorithms/iterative/Francis.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#ifndef FRANCIS_H
|
|
||||||
#define FRANCIS_H
|
|
||||||
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <string>
|
|
||||||
#include <cmath>
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <fstream>
|
|
||||||
#include <complex>
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
//#include <timer.h>
|
|
||||||
//#include <lapacke.h>
|
|
||||||
//#include <Eigen/Dense>
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
|
|
||||||
template <class T> int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
|
|
||||||
|
|
||||||
/**
|
|
||||||
Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
|
|
||||||
H =
|
|
||||||
x x x x x x x x x
|
|
||||||
x x x x x x x x x
|
|
||||||
0 x x x x x x x x
|
|
||||||
0 0 x x x x x x x
|
|
||||||
0 0 0 x x x x x x
|
|
||||||
0 0 0 0 x x x x x
|
|
||||||
0 0 0 0 0 x x x x
|
|
||||||
0 0 0 0 0 0 x x x
|
|
||||||
0 0 0 0 0 0 0 x x
|
|
||||||
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.
|
|
||||||
**/
|
|
||||||
template <class T>
|
|
||||||
int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
|
|
||||||
{
|
|
||||||
DenseMatrix<T> H = Hin;
|
|
||||||
|
|
||||||
int N ; SizeSquare(H,N);
|
|
||||||
int M = N;
|
|
||||||
|
|
||||||
Fill(evals,0);
|
|
||||||
Fill(evecs,0);
|
|
||||||
|
|
||||||
T s,t,x=0,y=0,z=0;
|
|
||||||
T u,d;
|
|
||||||
T apd,amd,bc;
|
|
||||||
DenseVector<T> p(N,0);
|
|
||||||
T nrm = Norm(H); ///DenseMatrix Norm
|
|
||||||
int n, m;
|
|
||||||
int e = 0;
|
|
||||||
int it = 0;
|
|
||||||
int tot_it = 0;
|
|
||||||
int l = 0;
|
|
||||||
int r = 0;
|
|
||||||
DenseMatrix<T> P; Resize(P,N,N); Unity(P);
|
|
||||||
DenseVector<int> trows(N,0);
|
|
||||||
|
|
||||||
/// Check if the matrix is really hessenberg, if not abort
|
|
||||||
RealD sth = 0;
|
|
||||||
for(int j=0;j<N;j++){
|
|
||||||
for(int i=j+2;i<N;i++){
|
|
||||||
sth = abs(H[i][j]);
|
|
||||||
if(sth > small){
|
|
||||||
std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
do{
|
|
||||||
std::cout << "Francis QR Step N = " << N << std::endl;
|
|
||||||
/** Check for convergence
|
|
||||||
x x x x x
|
|
||||||
0 x x x x
|
|
||||||
0 0 x x x
|
|
||||||
0 0 x x x
|
|
||||||
0 0 0 0 x
|
|
||||||
for this matrix l = 4
|
|
||||||
**/
|
|
||||||
do{
|
|
||||||
l = Chop_subdiag(H,nrm,e,small);
|
|
||||||
r = 0; ///May have converged on more than one eval
|
|
||||||
///Single eval
|
|
||||||
if(l == N-1){
|
|
||||||
evals[e] = H[l][l];
|
|
||||||
N--; e++; r++; it = 0;
|
|
||||||
}
|
|
||||||
///RealD eval
|
|
||||||
if(l == N-2){
|
|
||||||
trows[l+1] = 1; ///Needed for UTSolve
|
|
||||||
apd = H[l][l] + H[l+1][l+1];
|
|
||||||
amd = H[l][l] - H[l+1][l+1];
|
|
||||||
bc = (T)4.0*H[l+1][l]*H[l][l+1];
|
|
||||||
evals[e] = (T)0.5*( apd + sqrt(amd*amd + bc) );
|
|
||||||
evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
|
|
||||||
N-=2; e+=2; r++; it = 0;
|
|
||||||
}
|
|
||||||
} while(r>0);
|
|
||||||
|
|
||||||
if(N ==0) break;
|
|
||||||
|
|
||||||
DenseVector<T > ck; Resize(ck,3);
|
|
||||||
DenseVector<T> v; Resize(v,3);
|
|
||||||
|
|
||||||
for(int m = N-3; m >= l; m--){
|
|
||||||
///Starting vector essentially random shift.
|
|
||||||
if(it%10 == 0 && N >= 3 && it > 0){
|
|
||||||
s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
|
|
||||||
t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
|
|
||||||
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
|
|
||||||
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
|
|
||||||
z = H[m+1][m]*H[m+2][m+1];
|
|
||||||
}
|
|
||||||
///Starting vector implicit Q theorem
|
|
||||||
else{
|
|
||||||
s = (H[N-2][N-2] + H[N-1][N-1]);
|
|
||||||
t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
|
|
||||||
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
|
|
||||||
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
|
|
||||||
z = H[m+1][m]*H[m+2][m+1];
|
|
||||||
}
|
|
||||||
ck[0] = x; ck[1] = y; ck[2] = z;
|
|
||||||
|
|
||||||
if(m == l) break;
|
|
||||||
|
|
||||||
/** Some stupid thing from numerical recipies, seems to work**/
|
|
||||||
// PAB.. for heaven's sake quote page, purpose, evidence it works.
|
|
||||||
// what sort of comment is that!?!?!?
|
|
||||||
u=abs(H[m][m-1])*(abs(y)+abs(z));
|
|
||||||
d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
|
|
||||||
if ((T)abs(u+d) == (T)abs(d) ){
|
|
||||||
l = m; break;
|
|
||||||
}
|
|
||||||
|
|
||||||
//if (u < small){l = m; break;}
|
|
||||||
}
|
|
||||||
if(it > 100000){
|
|
||||||
std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
|
|
||||||
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
normalize(ck); ///Normalization cancels in PHP anyway
|
|
||||||
T beta;
|
|
||||||
Householder_vector<T >(ck, 0, 2, v, beta);
|
|
||||||
Householder_mult<T >(H,v,beta,0,l,l+2,0);
|
|
||||||
Householder_mult<T >(H,v,beta,0,l,l+2,1);
|
|
||||||
///Accumulate eigenvector
|
|
||||||
Householder_mult<T >(P,v,beta,0,l,l+2,1);
|
|
||||||
int sw = 0; ///Are we on the last row?
|
|
||||||
for(int k=l;k<N-2;k++){
|
|
||||||
x = H[k+1][k];
|
|
||||||
y = H[k+2][k];
|
|
||||||
z = (T)0.0;
|
|
||||||
if(k+3 <= N-1){
|
|
||||||
z = H[k+3][k];
|
|
||||||
} else{
|
|
||||||
sw = 1;
|
|
||||||
v[2] = (T)0.0;
|
|
||||||
}
|
|
||||||
ck[0] = x; ck[1] = y; ck[2] = z;
|
|
||||||
normalize(ck);
|
|
||||||
Householder_vector<T >(ck, 0, 2-sw, v, beta);
|
|
||||||
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
|
|
||||||
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
|
|
||||||
///Accumulate eigenvector
|
|
||||||
Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
|
|
||||||
}
|
|
||||||
it++;
|
|
||||||
tot_it++;
|
|
||||||
}while(N > 1);
|
|
||||||
N = evals.size();
|
|
||||||
///Annoying - UT solves in reverse order;
|
|
||||||
DenseVector<T> tmp; Resize(tmp,N);
|
|
||||||
for(int i=0;i<N;i++){
|
|
||||||
tmp[i] = evals[N-i-1];
|
|
||||||
}
|
|
||||||
evals = tmp;
|
|
||||||
UTeigenvectors(H, trows, evals, evecs);
|
|
||||||
for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
|
|
||||||
return tot_it;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class T>
|
|
||||||
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
|
|
||||||
H =
|
|
||||||
x x 0 0 0 0
|
|
||||||
x x x 0 0 0
|
|
||||||
0 x x x 0 0
|
|
||||||
0 0 x x x 0
|
|
||||||
0 0 0 x x x
|
|
||||||
0 0 0 0 x x
|
|
||||||
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary. **/
|
|
||||||
return my_Wilkinson(Hin, evals, evecs, small, small);
|
|
||||||
}
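// Aside (editor's sketch, not part of the source): a hypothetical call site,
// assuming a symmetric tridiagonal DenseMatrix<double> T_mat of size N.
//
//   DenseVector<double> evals;  Resize(evals, N);
//   DenseMatrix<double> evecs;  Resize(evecs, N, N);
//   int iters = my_Wilkinson(T_mat, evals, evecs, 1.0e-13);
//   // per the code above, evals[i] / evecs[i] then hold the eigenpairs.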
|
|
||||||
|
|
||||||
template <class T>
|
|
||||||
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
|
|
||||||
{
|
|
||||||
int N; SizeSquare(Hin,N);
|
|
||||||
int M = N;
|
|
||||||
|
|
||||||
///I don't want to modify the input but matrices must be passed by reference
|
|
||||||
//Scale a matrix by its "norm"
|
|
||||||
//RealD Hnorm = abs( Hin.LargestDiag() ); H = H*(1.0/Hnorm);
|
|
||||||
DenseMatrix<T> H; H = Hin;
|
|
||||||
|
|
||||||
RealD Hnorm = abs(Norm(Hin));
|
|
||||||
H = H * (1.0 / Hnorm);
|
|
||||||
|
|
||||||
// TODO use openmp and memset
|
|
||||||
Fill(evals,0);
|
|
||||||
Fill(evecs,0);
|
|
||||||
|
|
||||||
T s, t, x = 0, y = 0, z = 0;
|
|
||||||
T u, d;
|
|
||||||
T apd, amd, bc;
|
|
||||||
DenseVector<T> p; Resize(p,N); Fill(p,0);
|
|
||||||
|
|
||||||
T nrm = Norm(H); ///DenseMatrix Norm
|
|
||||||
int n, m;
|
|
||||||
int e = 0;
|
|
||||||
int it = 0;
|
|
||||||
int tot_it = 0;
|
|
||||||
int l = 0;
|
|
||||||
int r = 0;
|
|
||||||
DenseMatrix<T> P; Resize(P,N,N);
|
|
||||||
Unity(P);
|
|
||||||
DenseVector<int> trows(N, 0);
|
|
||||||
/// Check if the matrix is really symm tridiag
|
|
||||||
RealD sth = 0;
|
|
||||||
for(int j = 0; j < N; ++j)
|
|
||||||
{
|
|
||||||
for(int i = j + 2; i < N; ++i)
|
|
||||||
{
|
|
||||||
if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
|
|
||||||
{
|
|
||||||
std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
|
|
||||||
std::cout << "Warning tridiagonalize and call again" << std::endl;
|
|
||||||
// exit(1); // see what is going on
|
|
||||||
//return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
do{
|
|
||||||
do{
|
|
||||||
//Jasper
|
|
||||||
//Check if the subdiagonal term is small enough (<small)
|
|
||||||
//if true then it is converged.
|
|
||||||
//check start from H.dim - e - 1
|
|
||||||
//How to deal with the case where more than 2 have converged?
//What if Chop_symm_subdiag returns something in the middle?
|
|
||||||
//--------------
|
|
||||||
l = Chop_symm_subdiag(H,nrm, e, small);
|
|
||||||
r = 0; ///May have converged on more than one eval
|
|
||||||
//Jasper
|
|
||||||
//In this case
|
|
||||||
// x x 0 0 0 0
|
|
||||||
// x x x 0 0 0
|
|
||||||
// 0 x x x 0 0
|
|
||||||
// 0 0 x x x 0
|
|
||||||
// 0 0 0 x x 0
|
|
||||||
// 0 0 0 0 0 x <- l
|
|
||||||
//--------------
|
|
||||||
///Single eval
|
|
||||||
if(l == N - 1)
|
|
||||||
{
|
|
||||||
evals[e] = H[l][l];
|
|
||||||
N--;
|
|
||||||
e++;
|
|
||||||
r++;
|
|
||||||
it = 0;
|
|
||||||
}
|
|
||||||
//Jasper
|
|
||||||
// x x 0 0 0 0
|
|
||||||
// x x x 0 0 0
|
|
||||||
// 0 x x x 0 0
|
|
||||||
// 0 0 x x 0 0
|
|
||||||
// 0 0 0 0 x x <- l
|
|
||||||
// 0 0 0 0 x x
|
|
||||||
//--------------
|
|
||||||
///RealD eval
|
|
||||||
if(l == N - 2)
|
|
||||||
{
|
|
||||||
trows[l + 1] = 1; ///Needed for UTSolve
|
|
||||||
apd = H[l][l] + H[l + 1][ l + 1];
|
|
||||||
amd = H[l][l] - H[l + 1][l + 1];
|
|
||||||
bc = (T) 4.0 * H[l + 1][l] * H[l][l + 1];
|
|
||||||
evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
|
|
||||||
evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
|
|
||||||
N -= 2;
|
|
||||||
e += 2;
|
|
||||||
r++;
|
|
||||||
it = 0;
|
|
||||||
}
|
|
||||||
}while(r > 0);
|
|
||||||
//Jasper
|
|
||||||
//Already converged
|
|
||||||
//--------------
|
|
||||||
if(N == 0) break;
|
|
||||||
|
|
||||||
DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
|
|
||||||
|
|
||||||
for(int m = N - 3; m >= l; m--)
|
|
||||||
{
|
|
||||||
///Starting vector essentially random shift.
|
|
||||||
if(it%10 == 0 && N >= 3 && it > 0)
|
|
||||||
{
|
|
||||||
t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
|
|
||||||
x = H[m][m] - t;
|
|
||||||
z = H[m + 1][m];
|
|
||||||
} else {
|
|
||||||
///Starting vector implicit Q theorem
|
|
||||||
d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
|
|
||||||
t = H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2]
|
|
||||||
/ (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
|
|
||||||
x = H[m][m] - t;
|
|
||||||
z = H[m + 1][m];
|
|
||||||
}
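// Aside (editor's note, not in the original): the shift t computed in the else
// branch is the standard Wilkinson shift. With a = H[N-2][N-2], c = H[N-1][N-1],
// b = H[N-1][N-2] and d = (a-c)/2,  t = c - b*b/(d + sign(d)*sqrt(d*d + b*b))
// is the eigenvalue of the trailing 2x2 block [[a,b],[b,c]] closer to c, which
// is what gives the implicitly shifted QR iteration its fast convergence.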
|
|
||||||
//Jasper
|
|
||||||
//why is this here????
|
|
||||||
//-----------------------
|
|
||||||
if(m == l)
|
|
||||||
break;
|
|
||||||
|
|
||||||
u = abs(H[m][m - 1]) * (abs(y) + abs(z));
|
|
||||||
d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
|
|
||||||
if ((T)abs(u + d) == (T)abs(d))
|
|
||||||
{
|
|
||||||
l = m;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//Jasper
|
|
||||||
if(it > 1000000)
|
|
||||||
{
|
|
||||||
std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
|
|
||||||
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
//
|
|
||||||
T s, c;
|
|
||||||
Givens_calc<T>(x, z, c, s);
|
|
||||||
Givens_mult<T>(H, l, l + 1, c, -s, 0);
|
|
||||||
Givens_mult<T>(H, l, l + 1, c, s, 1);
|
|
||||||
Givens_mult<T>(P, l, l + 1, c, s, 1);
|
|
||||||
//
|
|
||||||
for(int k = l; k < N - 2; ++k)
|
|
||||||
{
|
|
||||||
x = H.A[k + 1][k];
|
|
||||||
z = H.A[k + 2][k];
|
|
||||||
Givens_calc<T>(x, z, c, s);
|
|
||||||
Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
|
|
||||||
Givens_mult<T>(H, k + 1, k + 2, c, s, 1);
|
|
||||||
Givens_mult<T>(P, k + 1, k + 2, c, s, 1);
|
|
||||||
}
|
|
||||||
it++;
|
|
||||||
tot_it++;
|
|
||||||
}while(N > 1);
|
|
||||||
|
|
||||||
N = evals.size();
|
|
||||||
///Annoying - UT solves in reverse order;
|
|
||||||
DenseVector<T> tmp(N);
|
|
||||||
for(int i = 0; i < N; ++i)
|
|
||||||
tmp[i] = evals[N-i-1];
|
|
||||||
evals = tmp;
|
|
||||||
//
|
|
||||||
UTeigenvectors(H, trows, evals, evecs);
|
|
||||||
//UTSymmEigenvectors(H, trows, evals, evecs);
|
|
||||||
for(int i = 0; i < evals.size(); ++i)
|
|
||||||
{
|
|
||||||
evecs[i] = P * evecs[i];
|
|
||||||
normalize(evecs[i]);
|
|
||||||
evals[i] = evals[i] * Hnorm;
|
|
||||||
}
|
|
||||||
// // FIXME this is to test
|
|
||||||
// Hin.write("evecs3", evecs);
|
|
||||||
// Hin.write("evals3", evals);
|
|
||||||
// // check rsd
|
|
||||||
// for(int i = 0; i < M; i++) {
|
|
||||||
// vector<T> Aevec = Hin * evecs[i];
|
|
||||||
// RealD norm2(0.);
|
|
||||||
// for(int j = 0; j < M; j++) {
|
|
||||||
// norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
return tot_it;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class T>
|
|
||||||
void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
|
|
||||||
|
|
||||||
/**
|
|
||||||
turn a matrix A =
|
|
||||||
x x x x x
|
|
||||||
x x x x x
|
|
||||||
x x x x x
|
|
||||||
x x x x x
|
|
||||||
x x x x x
|
|
||||||
into
|
|
||||||
x x x x x
|
|
||||||
x x x x x
|
|
||||||
0 x x x x
|
|
||||||
0 0 x x x
|
|
||||||
0 0 0 x x
|
|
||||||
with householder rotations
|
|
||||||
Slow.
|
|
||||||
*/
|
|
||||||
int N ; SizeSquare(A,N);
|
|
||||||
DenseVector<T > p; Resize(p,N); Fill(p,0);
|
|
||||||
|
|
||||||
for(int k=start;k<N-2;k++){
|
|
||||||
//cerr << "hess" << k << std::endl;
|
|
||||||
DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
|
|
||||||
for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);} ///kth column
|
|
||||||
normalize(ck); ///Normalization cancels in PHP anyway
|
|
||||||
T beta;
|
|
||||||
Householder_vector<T >(ck, 0, ck.size()-1, v, beta); ///Householder vector
|
|
||||||
Householder_mult<T>(A,v,beta,start,k+1,N-1,0); ///A -> PA
|
|
||||||
Householder_mult<T >(A,v,beta,start,k+1,N-1,1); ///PA -> PAP^H
|
|
||||||
///Accumulate eigenvector
|
|
||||||
Householder_mult<T >(Q,v,beta,start,k+1,N-1,1); ///Q -> QP^H
|
|
||||||
}
|
|
||||||
/*for(int l=0;l<N-2;l++){
|
|
||||||
for(int k=l+2;k<N;k++){
|
|
||||||
A(0,k,l);
|
|
||||||
}
|
|
||||||
}*/
|
|
||||||
}
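// Aside (editor's sketch, not part of the source): typical use, per the comments
// above (names hypothetical):
//
//   DenseMatrix<double> A;                     // N x N input, overwritten in place
//   DenseMatrix<double> Q; Resize(Q,N,N); Unity(Q);
//   Hess(A, Q, 0);
//   // On return A is upper Hessenberg; Q accumulates the unitary similarity, so
//   // the original matrix is recovered (up to round-off) as Q * A * Q^dag.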
|
|
||||||
|
|
||||||
template <class T>
|
|
||||||
void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
|
|
||||||
///Tridiagonalize a matrix
|
|
||||||
int N; SizeSquare(A,N);
|
|
||||||
Hess(A,Q,start);
|
|
||||||
/*for(int l=0;l<N-2;l++){
|
|
||||||
for(int k=l+2;k<N;k++){
|
|
||||||
A(0,l,k);
|
|
||||||
}
|
|
||||||
}*/
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class T>
|
|
||||||
void ForceTridiagonal(DenseMatrix<T> &A){
|
|
||||||
///Tridiagonalize a matrix
|
|
||||||
int N ; SizeSquare(A,N);
|
|
||||||
for(int l=0;l<N-2;l++){
|
|
||||||
for(int k=l+2;k<N;k++){
|
|
||||||
A[l][k]=0;
|
|
||||||
A[k][l]=0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class T>
|
|
||||||
int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
|
|
||||||
///Solve a symmetric eigensystem, not necessarily in tridiagonal form
|
|
||||||
int N; SizeSquare(Ain,N);
|
|
||||||
DenseMatrix<T > A; A = Ain;
|
|
||||||
DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
|
|
||||||
Tri(A,Q,0);
|
|
||||||
int it = my_Wilkinson<T>(A, evals, evecs, small);
|
|
||||||
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
|
|
||||||
return it;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template <class T>
|
|
||||||
int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
|
|
||||||
return my_Wilkinson(Ain, evals, evecs, small);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class T>
|
|
||||||
int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
|
|
||||||
return my_SymmEigensystem(Ain, evals, evecs, small);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class T>
|
|
||||||
int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
|
|
||||||
///Solve a general eigensystem, not necessarily in tridiagonal form
|
|
||||||
int N = Ain.dim;
|
|
||||||
DenseMatrix<T > A(N); A = Ain;
|
|
||||||
DenseMatrix<T > Q(N);Q.Unity();
|
|
||||||
Hess(A,Q,0);
|
|
||||||
int it = QReigensystem<T>(A, evals, evecs, small);
|
|
||||||
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
|
|
||||||
return it;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,242 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/algorithms/iterative/Householder.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#ifndef HOUSEHOLDER_H
|
|
||||||
#define HOUSEHOLDER_H
|
|
||||||
|
|
||||||
#define TIMER(A) std::cout << GridLogMessage << __func__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define ENTER() std::cout << GridLogMessage << "ENTRY "<<__func__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define LEAVE() std::cout << GridLogMessage << "EXIT "<<__func__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
|
|
||||||
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <string>
|
|
||||||
#include <cmath>
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <fstream>
|
|
||||||
#include <complex>
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
/** Comparison function for finding the max element in a vector **/
|
|
||||||
template <class T> bool cf(T i, T j) {
|
|
||||||
return abs(i) < abs(j);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
Calculate a real Givens angle
|
|
||||||
**/
|
|
||||||
template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
|
|
||||||
|
|
||||||
RealD mz = (RealD)abs(z);
|
|
||||||
|
|
||||||
if(mz==0.0){
c = 1; s = 0;
return; // degenerate case: avoid dividing by zero below when y and z are both zero
}
|
|
||||||
if(mz >= (RealD)abs(y)){
|
|
||||||
T t = -y/z;
|
|
||||||
s = (T)1.0 / sqrt ((T)1.0 + t * t);
|
|
||||||
c = s * t;
|
|
||||||
} else {
|
|
||||||
T t = -z/y;
|
|
||||||
c = (T)1.0 / sqrt ((T)1.0 + t * t);
|
|
||||||
s = c * t;
|
|
||||||
}
|
|
||||||
}
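// Aside (editor's note, not in the original): in either branch the returned pair
// satisfies s*y + c*z = 0 and c*c + s*s = 1, so the rotation
//   | c  -s | ( y )   ( r )
//   | s   c | ( z ) = ( 0 )
// annihilates z against y. This is the convention assumed by the calls
// Givens_mult(H, l, l+1, c, -s, 0) in the Wilkinson sweep.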
|
|
||||||
|
|
||||||
template <class T> inline void Givens_mult(DenseMatrix<T> &A, int i, int k, T c, T s, int dir)
|
|
||||||
{
|
|
||||||
int q ; SizeSquare(A,q);
|
|
||||||
|
|
||||||
if(dir == 0){
|
|
||||||
for(int j=0;j<q;j++){
|
|
||||||
T nu = A[i][j];
|
|
||||||
T w = A[k][j];
|
|
||||||
A[i][j] = (c*nu + s*w);
|
|
||||||
A[k][j] = (-s*nu + c*w);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(dir == 1){
|
|
||||||
for(int j=0;j<q;j++){
|
|
||||||
T nu = A[j][i];
|
|
||||||
T w = A[j][k];
|
|
||||||
A[j][i] = (c*nu - s*w);
|
|
||||||
A[j][k] = (s*nu + c*w);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
from input = x;
|
|
||||||
Compute the complex Householder vector, v, such that
|
|
||||||
P = (I - b v transpose(v) )
|
|
||||||
b = 2/v.v
|
|
||||||
|
|
||||||
P | x |   | x |  k = 0
  | x |   | 0 |
  | x | = | 0 |
  | x |   | 0 |  j = 3
  | x |   | x |
|
|
||||||
|
|
||||||
These are the "Unreduced" Householder vectors.
|
|
||||||
|
|
||||||
**/
|
|
||||||
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
|
|
||||||
{
|
|
||||||
int N ; Size(input,N);
|
|
||||||
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
|
|
||||||
|
|
||||||
if(abs(m) > 0.0){
|
|
||||||
T alpha = 0;
|
|
||||||
|
|
||||||
for(int i=k; i<j+1; i++){
|
|
||||||
v[i] = input[i]/m;
|
|
||||||
alpha = alpha + v[i]*conj(v[i]);
|
|
||||||
}
|
|
||||||
alpha = sqrt(alpha);
|
|
||||||
beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
|
|
||||||
|
|
||||||
if(abs(v[k]) > 0.0) v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
|
|
||||||
else v[k] = -alpha;
|
|
||||||
} else{
|
|
||||||
for(int i=k; i<j+1; i++){
|
|
||||||
v[i] = 0.0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
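// Aside (editor's note, not in the original): the pair (v, beta) defines the
// reflector P = I - beta * v * v^dag acting on components k..j; with the values
// set above beta equals 2/(v^dag v), so P is unitary and maps the selected
// segment of 'input' onto a multiple of e_k, zeroing entries k+1..j. Scaling by
// the largest element m only guards against overflow and does not change P.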
|
|
||||||
|
|
||||||
/**
|
|
||||||
from input = x;
|
|
||||||
Compute the complex Householder vector, v, such that
|
|
||||||
P = (I - b v transpose(v) )
|
|
||||||
b = 2/v.v
|
|
||||||
|
|
||||||
Px = alpha*e_dir
|
|
||||||
|
|
||||||
These are the "Unreduced" Householder vectors.
|
|
||||||
|
|
||||||
**/
|
|
||||||
|
|
||||||
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
|
|
||||||
{
|
|
||||||
int N = input.size();
|
|
||||||
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T>);
|
|
||||||
|
|
||||||
if(abs(m) > 0.0){
|
|
||||||
T alpha = 0;
|
|
||||||
|
|
||||||
for(int i=k; i<j+1; i++){
|
|
||||||
v[i] = input[i]/m;
|
|
||||||
alpha = alpha + v[i]*conj(v[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
alpha = sqrt(alpha);
|
|
||||||
beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
|
|
||||||
|
|
||||||
if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
|
|
||||||
else v[dir] = -alpha;
|
|
||||||
}else{
|
|
||||||
for(int i=k; i<j+1; i++){
|
|
||||||
v[i] = 0.0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
Compute the product PA if trans = 0
|
|
||||||
AP if trans = 1
|
|
||||||
P = (I - b v transpose(v) )
|
|
||||||
b = 2/v.v
|
|
||||||
start at element l of matrix A
|
|
||||||
only elements k..j of v (j - k + 1 of them) are nonzero
|
|
||||||
**/
|
|
||||||
|
|
||||||
template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
|
|
||||||
{
|
|
||||||
int N ; SizeSquare(A,N);
|
|
||||||
|
|
||||||
if(abs(beta) > 0.0){
|
|
||||||
for(int p=l; p<N; p++){
|
|
||||||
T s = 0;
|
|
||||||
if(trans==0){
|
|
||||||
for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
|
|
||||||
s *= beta;
|
|
||||||
for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
|
|
||||||
} else {
|
|
||||||
for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
|
|
||||||
s *= beta;
|
|
||||||
for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
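// Aside (editor's note, not in the original): applying the reflector this way
// costs O((j-k+1) * (N-l)) operations per call, so the sweep never forms the
// N x N matrix P explicitly; only entries k..j of each touched row/column of A
// are updated, which is what keeps the QR iterations cheap.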
|
|
||||||
|
|
||||||
/**
|
|
||||||
Compute the product PA if trans = 0
|
|
||||||
AP if trans = 1
|
|
||||||
P = (I - b v transpose(v) )
|
|
||||||
b = 2/v.v
|
|
||||||
start at element l of matrix A
|
|
||||||
only elements k..j of v (j - k + 1 of them) are nonzero
|
|
||||||
A is tridiagonal
|
|
||||||
**/
|
|
||||||
template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
|
|
||||||
{
|
|
||||||
if(abs(beta) > 0.0){
|
|
||||||
|
|
||||||
int N ; SizeSquare(A,N);
|
|
||||||
|
|
||||||
DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0);
|
|
||||||
|
|
||||||
T s;
|
|
||||||
for(int p=l; p<M; p++){
|
|
||||||
s = 0;
|
|
||||||
if(trans==0){
|
|
||||||
for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
|
|
||||||
}else{
|
|
||||||
for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
|
|
||||||
}
|
|
||||||
s = beta*s;
|
|
||||||
if(trans==0){
|
|
||||||
for(int i=k;i<j+1;i++) tmp[i][p] = tmp[i][p] - s*v[i-k];
|
|
||||||
}else{
|
|
||||||
for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int p=l; p<M; p++){
|
|
||||||
if(trans==0){
|
|
||||||
for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
|
|
||||||
}else{
|
|
||||||
for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -33,6 +33,8 @@ directory
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
|
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// Block conjugate gradient. Dimension zero should be the block direction
|
// Block conjugate gradient. Dimension zero should be the block direction
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@ -40,25 +42,280 @@ template <class Field>
|
|||||||
class BlockConjugateGradient : public OperatorFunction<Field> {
|
class BlockConjugateGradient : public OperatorFunction<Field> {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
|
||||||
typedef typename Field::scalar_type scomplex;
|
typedef typename Field::scalar_type scomplex;
|
||||||
|
|
||||||
const int blockDim = 0;
|
int blockDim ;
|
||||||
|
|
||||||
int Nblock;
|
int Nblock;
|
||||||
|
|
||||||
|
BlockCGtype CGtype;
|
||||||
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
|
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
|
||||||
// Defaults true.
|
// Defaults true.
|
||||||
RealD Tolerance;
|
RealD Tolerance;
|
||||||
Integer MaxIterations;
|
Integer MaxIterations;
|
||||||
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
|
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
|
||||||
|
|
||||||
BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
|
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
|
||||||
: Tolerance(tol),
|
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
|
||||||
MaxIterations(maxit),
|
{};
|
||||||
ErrorOnNoConverge(err_on_no_conv){};
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Thin QR factorisation (google it)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
||||||
|
Eigen::MatrixXcd &C,
|
||||||
|
Eigen::MatrixXcd &Cinv,
|
||||||
|
Field & Q,
|
||||||
|
const Field & R)
|
||||||
|
{
|
||||||
|
int Orthog = blockDim; // First dimension is block dim; this is an assumption
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//Dimensions
|
||||||
|
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
|
||||||
|
//
|
||||||
|
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
|
||||||
|
//
|
||||||
|
// Q C = R => Q = R C^{-1}
|
||||||
|
//
|
||||||
|
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
|
||||||
|
//
|
||||||
|
// Set C = L^{dag}, and then Q^dag Q = ident
|
||||||
|
//
|
||||||
|
// Checks:
|
||||||
|
// Cdag C = Rdag R ; passes.
|
||||||
|
// QdagQ = 1 ; passes
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
sliceInnerProductMatrix(m_rr,R,R,Orthog);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Cholesky from Eigen
|
||||||
|
// There exists a ldlt that is documented as more stable
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||||
|
|
||||||
|
C = L.adjoint();
|
||||||
|
Cinv = C.inverse();
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Q = R C^{-1}
|
||||||
|
//
|
||||||
|
// Q_j = R_i Cinv(i,j)
|
||||||
|
//
|
||||||
|
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// FIXME:: make a sliceMulMatrix to avoid zero vector
|
||||||
|
sliceMulMatrix(Q,Cinv,R,Orthog);
|
||||||
|
}
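////////////////////////////////////////////////////////////////////////////////////////////////////
// Editor's aside (not part of this commit): the same Cholesky-based thin QR step
// written against plain Eigen matrices, to make the algebra above concrete.
// Given a tall R, find C with R = Q C and Q^dag Q = 1:
//   rr = R^dag R = L L^dag (Cholesky),  C = L^dag,  Q = R C^{-1}.
// Illustrative only; the solver itself works on lattice Fields via
// sliceInnerProductMatrix / sliceMulMatrix.
////////////////////////////////////////////////////////////////////////////////////////////////////
static void ThinQRfact_eigen_sketch(const Eigen::MatrixXcd &R,
                                    Eigen::MatrixXcd &Q,
                                    Eigen::MatrixXcd &C)
{
  Eigen::MatrixXcd rr = R.adjoint() * R;     // Nblock x Nblock Gram matrix
  Eigen::MatrixXcd L  = rr.llt().matrixL();  // rr = L L^dag
  C = L.adjoint();                           // upper triangular factor
  Q = R * C.inverse();                       // orthonormal columns: Q^dag Q = 1
}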
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Call one of several implementations
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
||||||
{
|
{
|
||||||
int Orthog = 0; // First dimension is block dim
|
if ( CGtype == BlockCGrQ ) {
|
||||||
|
BlockCGrQsolve(Linop,Src,Psi);
|
||||||
|
} else if (CGtype == BlockCG ) {
|
||||||
|
BlockCGsolve(Linop,Src,Psi);
|
||||||
|
} else if (CGtype == CGmultiRHS ) {
|
||||||
|
CGmultiRHSsolve(Linop,Src,Psi);
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// BlockCGrQ implementation:
|
||||||
|
//--------------------------
|
||||||
|
// X is guess/Solution
|
||||||
|
// B is RHS
|
||||||
|
// Solve A X_i = B_i ; i refers to Nblock index
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
||||||
|
{
|
||||||
|
int Orthog = blockDim; // First dimension is block dim; this is an assumption
|
||||||
|
Nblock = B._grid->_fdimensions[Orthog];
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
|
||||||
|
|
||||||
|
X.checkerboard = B.checkerboard;
|
||||||
|
conformable(X, B);
|
||||||
|
|
||||||
|
Field tmp(B);
|
||||||
|
Field Q(B);
|
||||||
|
Field D(B);
|
||||||
|
Field Z(B);
|
||||||
|
Field AD(B);
|
||||||
|
|
||||||
|
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(Nblock,Nblock);
|
||||||
|
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(Nblock,Nblock);
|
||||||
|
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
|
||||||
|
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
|
||||||
|
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
|
||||||
|
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(Nblock,Nblock);
|
||||||
|
|
||||||
|
// Initial residual computation & set up
|
||||||
|
std::vector<RealD> residuals(Nblock);
|
||||||
|
std::vector<RealD> ssq(Nblock);
|
||||||
|
|
||||||
|
sliceNorm(ssq,B,Orthog);
|
||||||
|
RealD sssum=0;
|
||||||
|
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
||||||
|
|
||||||
|
sliceNorm(residuals,B,Orthog);
|
||||||
|
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
|
||||||
|
|
||||||
|
sliceNorm(residuals,X,Orthog);
|
||||||
|
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
|
||||||
|
|
||||||
|
/************************************************************************
|
||||||
|
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
|
||||||
|
************************************************************************
|
||||||
|
* Dimensions:
|
||||||
|
*
|
||||||
|
* X,B==(Nferm x Nblock)
|
||||||
|
* A==(Nferm x Nferm)
|
||||||
|
*
|
||||||
|
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
|
||||||
|
*
|
||||||
|
* QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
|
||||||
|
* for k:
|
||||||
|
* Z = AD
|
||||||
|
* M = [D^dag Z]^{-1}
|
||||||
|
* X = X + D MC
|
||||||
|
* QS = Q - ZM
|
||||||
|
* D = Q + D S^dag
|
||||||
|
* C = S C
|
||||||
|
*/
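// Editor's aside (not part of this commit): a hypothetical call site, with
// FermionField, HermOp, src and sol standing in for user-defined types and
// objects, and dimension 0 of the lattice taken as the block direction:
//
//   BlockConjugateGradient<FermionField> BCGrQ(BlockCGrQ, 0, 1.0e-8, 10000);
//   BCGrQ(HermOp, src, sol);   // solves A sol_i = src_i for every block slice i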
|
||||||
|
///////////////////////////////////////
|
||||||
|
// Initial block: initial search dir is guess
|
||||||
|
///////////////////////////////////////
|
||||||
|
std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
|
||||||
|
|
||||||
|
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
|
||||||
|
|
||||||
|
Linop.HermOp(X, AD);
|
||||||
|
tmp = B - AD;
|
||||||
|
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
|
||||||
|
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
||||||
|
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
|
||||||
|
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
|
||||||
|
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
|
||||||
|
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
|
||||||
|
D=Q;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
|
||||||
|
|
||||||
|
///////////////////////////////////////
|
||||||
|
// Timers
|
||||||
|
///////////////////////////////////////
|
||||||
|
GridStopWatch sliceInnerTimer;
|
||||||
|
GridStopWatch sliceMaddTimer;
|
||||||
|
GridStopWatch QRTimer;
|
||||||
|
GridStopWatch MatrixTimer;
|
||||||
|
GridStopWatch SolverTimer;
|
||||||
|
SolverTimer.Start();
|
||||||
|
|
||||||
|
int k;
|
||||||
|
for (k = 1; k <= MaxIterations; k++) {
|
||||||
|
|
||||||
|
//3. Z = AD
|
||||||
|
MatrixTimer.Start();
|
||||||
|
Linop.HermOp(D, Z);
|
||||||
|
MatrixTimer.Stop();
|
||||||
|
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
|
||||||
|
|
||||||
|
//4. M = [D^dag Z]^{-1}
|
||||||
|
sliceInnerTimer.Start();
|
||||||
|
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
|
||||||
|
sliceInnerTimer.Stop();
|
||||||
|
m_M = m_DZ.inverse();
|
||||||
|
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
|
||||||
|
|
||||||
|
//5. X = X + D MC
|
||||||
|
m_tmp = m_M * m_C;
|
||||||
|
sliceMaddTimer.Start();
|
||||||
|
sliceMaddMatrix(X,m_tmp, D,X,Orthog);
|
||||||
|
sliceMaddTimer.Stop();
|
||||||
|
|
||||||
|
//6. QS = Q - ZM
|
||||||
|
sliceMaddTimer.Start();
|
||||||
|
sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
|
||||||
|
sliceMaddTimer.Stop();
|
||||||
|
QRTimer.Start();
|
||||||
|
ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
|
||||||
|
QRTimer.Stop();
|
||||||
|
|
||||||
|
//7. D = Q + D S^dag
|
||||||
|
m_tmp = m_S.adjoint();
|
||||||
|
sliceMaddTimer.Start();
|
||||||
|
sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
|
||||||
|
sliceMaddTimer.Stop();
|
||||||
|
|
||||||
|
//8. C = S C
|
||||||
|
m_C = m_S*m_C;
|
||||||
|
|
||||||
|
/*********************
|
||||||
|
* convergence monitor
|
||||||
|
*********************
|
||||||
|
*/
|
||||||
|
m_rr = m_C.adjoint() * m_C;
|
||||||
|
|
||||||
|
RealD max_resid=0;
|
||||||
|
RealD rrsum=0;
|
||||||
|
RealD rr;
|
||||||
|
|
||||||
|
for(int b=0;b<Nblock;b++) {
|
||||||
|
rrsum+=real(m_rr(b,b));
|
||||||
|
rr = real(m_rr(b,b))/ssq[b];
|
||||||
|
if ( rr > max_resid ) max_resid = rr;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
|
||||||
|
<<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
|
||||||
|
|
||||||
|
if ( max_resid < Tolerance*Tolerance ) {
|
||||||
|
|
||||||
|
SolverTimer.Stop();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
|
||||||
|
|
||||||
|
for(int b=0;b<Nblock;b++){
|
||||||
|
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
|
||||||
|
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
|
||||||
|
|
||||||
|
Linop.HermOp(X, AD);
|
||||||
|
AD = AD-B;
|
||||||
|
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed() <<std::endl;
|
||||||
|
|
||||||
|
IterationsToComplete = k;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
|
||||||
|
|
||||||
|
if (ErrorOnNoConverge) assert(0);
|
||||||
|
IterationsToComplete = k;
|
||||||
|
}
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
||||||
|
{
|
||||||
|
int Orthog = blockDim; // First dimension is block dim; this is an assumption
|
||||||
Nblock = Src._grid->_fdimensions[Orthog];
|
Nblock = Src._grid->_fdimensions[Orthog];
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
|
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
|
||||||
@ -162,8 +419,9 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
|||||||
*********************
|
*********************
|
||||||
*/
|
*/
|
||||||
RealD max_resid=0;
|
RealD max_resid=0;
|
||||||
|
RealD rr;
|
||||||
for(int b=0;b<Nblock;b++){
|
for(int b=0;b<Nblock;b++){
|
||||||
RealD rr = real(m_rr(b,b))/ssq[b];
|
rr = real(m_rr(b,b))/ssq[b];
|
||||||
if ( rr > max_resid ) max_resid = rr;
|
if ( rr > max_resid ) max_resid = rr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -173,13 +431,14 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
|||||||
|
|
||||||
std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
|
std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
|
||||||
for(int b=0;b<Nblock;b++){
|
for(int b=0;b<Nblock;b++){
|
||||||
std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
|
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
|
||||||
|
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
|
||||||
}
|
}
|
||||||
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
|
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
|
||||||
|
|
||||||
Linop.HermOp(Psi, AP);
|
Linop.HermOp(Psi, AP);
|
||||||
AP = AP-Src;
|
AP = AP-Src;
|
||||||
std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
|
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
|
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
|
||||||
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||||
@ -197,35 +456,13 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
|||||||
if (ErrorOnNoConverge) assert(0);
|
if (ErrorOnNoConverge) assert(0);
|
||||||
IterationsToComplete = k;
|
IterationsToComplete = k;
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// multiRHS conjugate gradient. Dimension zero should be the block direction
|
// multiRHS conjugate gradient. Dimension zero should be the block direction
|
||||||
|
// Use this for spread out across nodes
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
template <class Field>
|
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
||||||
class MultiRHSConjugateGradient : public OperatorFunction<Field> {
|
|
||||||
public:
|
|
||||||
|
|
||||||
typedef typename Field::scalar_type scomplex;
|
|
||||||
|
|
||||||
const int blockDim = 0;
|
|
||||||
|
|
||||||
int Nblock;
|
|
||||||
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
|
|
||||||
// Defaults true.
|
|
||||||
RealD Tolerance;
|
|
||||||
Integer MaxIterations;
|
|
||||||
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
|
|
||||||
|
|
||||||
MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
|
|
||||||
: Tolerance(tol),
|
|
||||||
MaxIterations(maxit),
|
|
||||||
ErrorOnNoConverge(err_on_no_conv){};
|
|
||||||
|
|
||||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
|
||||||
{
|
{
|
||||||
int Orthog = 0; // First dimension is block dim
|
int Orthog = blockDim; // First dimension is block dim
|
||||||
Nblock = Src._grid->_fdimensions[Orthog];
|
Nblock = Src._grid->_fdimensions[Orthog];
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
|
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
|
||||||
@ -285,12 +522,10 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
|||||||
MatrixTimer.Stop();
|
MatrixTimer.Stop();
|
||||||
|
|
||||||
// Alpha
|
// Alpha
|
||||||
// sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog);
|
|
||||||
sliceInnerTimer.Start();
|
sliceInnerTimer.Start();
|
||||||
sliceInnerProductVector(v_pAp,P,AP,Orthog);
|
sliceInnerProductVector(v_pAp,P,AP,Orthog);
|
||||||
sliceInnerTimer.Stop();
|
sliceInnerTimer.Stop();
|
||||||
for(int b=0;b<Nblock;b++){
|
for(int b=0;b<Nblock;b++){
|
||||||
// std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl;
|
|
||||||
v_alpha[b] = v_rr[b]/real(v_pAp[b]);
|
v_alpha[b] = v_rr[b]/real(v_pAp[b]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -332,7 +567,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
|||||||
|
|
||||||
std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
|
std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
|
||||||
for(int b=0;b<Nblock;b++){
|
for(int b=0;b<Nblock;b++){
|
||||||
std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
|
std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
|
||||||
}
|
}
|
||||||
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
|
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
|
||||||
|
|
||||||
@ -358,9 +593,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
|
|||||||
if (ErrorOnNoConverge) assert(0);
|
if (ErrorOnNoConverge) assert(0);
|
||||||
IterationsToComplete = k;
|
IterationsToComplete = k;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,81 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/algorithms/iterative/EigenSort.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#ifndef GRID_EIGENSORT_H
|
|
||||||
#define GRID_EIGENSORT_H
|
|
||||||
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
/////////////////////////////////////////////////////////////
|
|
||||||
// Eigen sorter to begin with
|
|
||||||
/////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
class SortEigen {
|
|
||||||
private:
|
|
||||||
|
|
||||||
//hacking for testing for now
|
|
||||||
private:
|
|
||||||
static bool less_lmd(RealD left,RealD right){
|
|
||||||
return left > right;
|
|
||||||
}
|
|
||||||
static bool less_pair(std::pair<RealD,Field const*>& left,
|
|
||||||
std::pair<RealD,Field const*>& right){
|
|
||||||
return left.first > (right.first);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
void push(DenseVector<RealD>& lmd,
|
|
||||||
DenseVector<Field>& evec,int N) {
|
|
||||||
DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
|
|
||||||
for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
|
|
||||||
|
|
||||||
DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());
|
|
||||||
for(int i=0;i<lmd.size();++i)
|
|
||||||
emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
|
|
||||||
|
|
||||||
partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
|
|
||||||
|
|
||||||
typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
|
|
||||||
for(int i=0;i<N;++i){
|
|
||||||
lmd[i]=it->first;
|
|
||||||
evec[i]=*(it->second);
|
|
||||||
++it;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void push(DenseVector<RealD>& lmd,int N) {
|
|
||||||
std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
|
|
||||||
}
|
|
||||||
bool saturated(RealD lmd, RealD thrs) {
|
|
||||||
return fabs(lmd) > fabs(thrs);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
File diff suppressed because it is too large
@ -1,7 +1,5 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
@ -11,7 +9,7 @@ int PointerCache::victim;
|
|||||||
|
|
||||||
void *PointerCache::Insert(void *ptr,size_t bytes) {
|
void *PointerCache::Insert(void *ptr,size_t bytes) {
|
||||||
|
|
||||||
if (bytes < 4096 ) return NULL;
|
if (bytes < 4096 ) return ptr;
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
assert(omp_in_parallel()==0);
|
assert(omp_in_parallel()==0);
|
||||||
@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||||
|
{
|
||||||
|
#ifdef __linux__
|
||||||
|
int fd = open("/proc/self/pagemap", O_RDONLY);
|
||||||
|
assert(fd >= 0);
|
||||||
|
const int page_size = 4096;
|
||||||
|
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||||
|
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||||
|
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||||
|
uint64_t pagedata[npages];
|
||||||
|
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||||
|
assert(ret == offset);
|
||||||
|
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
||||||
|
assert(ret == sizeof(uint64_t) * npages);
|
||||||
|
int nhugepages = npages / 512;
|
||||||
|
int n4ktotal, nnothuge;
|
||||||
|
n4ktotal = 0;
|
||||||
|
nnothuge = 0;
|
||||||
|
for (int i = 0; i < nhugepages; ++i) {
|
||||||
|
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
|
||||||
|
for (int j = 0; j < 512; ++j) {
|
||||||
|
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
|
||||||
|
++n4ktotal;
|
||||||
|
if (pageaddr != baseaddr + j * page_size)
|
||||||
|
++nnothuge;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int rank = CartesianCommunicator::RankWorld();
|
||||||
|
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
|
||||||
|
#endif
|
||||||
|
}
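// Editor's aside (not part of this commit): intended use is immediately after a
// large allocation, e.g.
//
//   void *buf = /* 2MB-aligned allocation of 'bytes' bytes */;
//   check_huge_pages(buf, bytes);  // prints how many 4k pages are not in 2MB pages
//
// Reading /proc/self/pagemap may require elevated privileges on recent kernels,
// in which case the frame numbers come back as zero and every page is reported
// as not huge; treat the output as a hint rather than a guarantee.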
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -64,6 +64,8 @@ namespace Grid {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void check_huge_pages(void *Buf,uint64_t BYTES);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// A lattice of something, but assume the something is SIMDized.
|
// A lattice of something, but assume the something is SIMDized.
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
@ -92,18 +94,34 @@ public:
|
|||||||
size_type bytes = __n*sizeof(_Tp);
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
|
|
||||||
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
||||||
|
// if ( ptr != NULL )
|
||||||
|
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
|
||||||
|
|
||||||
|
//////////////////
|
||||||
|
// Hack 2MB align; could make option probably doesn't need configurability
|
||||||
|
//////////////////
|
||||||
|
//define GRID_ALLOC_ALIGN (128)
|
||||||
|
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
|
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
|
||||||
#else
|
#else
|
||||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
|
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
|
||||||
#endif
|
#endif
|
||||||
|
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
|
||||||
|
// First touch optimise in threaded loop
|
||||||
|
uint8_t *cp = (uint8_t *)ptr;
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
#pragma omp parallel for
|
||||||
|
#endif
|
||||||
|
for(size_type n=0;n<bytes;n+=4096){
|
||||||
|
cp[n]=0;
|
||||||
|
}
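// Editor's aside (not part of this commit): the loop above is a "first touch"
// pass. Under the default Linux NUMA policy a page is placed on the node of the
// thread that first writes it, so touching one byte per 4k page from inside the
// parallel region spreads the allocation across the nodes that will later use
// it. The second allocator below does the same with an explicitly static schedule.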
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void deallocate(pointer __p, size_type __n) {
|
void deallocate(pointer __p, size_type __n) {
|
||||||
size_type bytes = __n * sizeof(_Tp);
|
size_type bytes = __n * sizeof(_Tp);
|
||||||
|
|
||||||
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
|
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
|
||||||
|
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
@ -182,10 +200,17 @@ public:
|
|||||||
pointer allocate(size_type __n, const void* _p= 0)
|
pointer allocate(size_type __n, const void* _p= 0)
|
||||||
{
|
{
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
|
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
|
||||||
#else
|
#else
|
||||||
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
|
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
|
||||||
#endif
|
#endif
|
||||||
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
|
uint8_t *cp = (uint8_t *)ptr;
|
||||||
|
// One touch per 4k page, static OMP loop to catch same loop order
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
|
for(size_type n=0;n<bytes;n+=4096){
|
||||||
|
cp[n]=0;
|
||||||
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
void deallocate(pointer __p, size_type) {
|
void deallocate(pointer __p, size_type) {
|
||||||
|
@ -50,7 +50,6 @@ public:
|
|||||||
|
|
||||||
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
|
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
|
||||||
|
|
||||||
|
|
||||||
// Physics Grid information.
|
// Physics Grid information.
|
||||||
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
|
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
|
||||||
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
|
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
|
||||||
@ -63,13 +62,12 @@ public:
|
|||||||
int _isites;
|
int _isites;
|
||||||
int _fsites; // _isites*_osites = product(dimensions).
|
int _fsites; // _isites*_osites = product(dimensions).
|
||||||
int _gsites;
|
int _gsites;
|
||||||
std::vector<int> _slice_block; // subslice information
|
std::vector<int> _slice_block;// subslice information
|
||||||
std::vector<int> _slice_stride;
|
std::vector<int> _slice_stride;
|
||||||
std::vector<int> _slice_nblock;
|
std::vector<int> _slice_nblock;
|
||||||
|
|
||||||
// Might need these at some point
|
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
|
||||||
// std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
|
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
|
||||||
// std::vector<int> _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
@ -176,6 +174,7 @@ public:
|
|||||||
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
|
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
|
||||||
inline int Nd (void) const { return _ndimension;};
|
inline int Nd (void) const { return _ndimension;};
|
||||||
|
|
||||||
|
inline const std::vector<int> LocalStarts(void) { return _lstart; };
|
||||||
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
|
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
|
||||||
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
|
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
|
||||||
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
|
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
|
||||||
@ -186,17 +185,18 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void show_decomposition(){
|
void show_decomposition(){
|
||||||
std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl;
|
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
|
||||||
std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl;
|
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
|
||||||
std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl;
|
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
|
||||||
std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
|
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
|
||||||
std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl;
|
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
|
||||||
std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl;
|
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
|
||||||
std::cout << GridLogMessage << "iSites : " << _isites << std::endl;
|
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
|
||||||
std::cout << GridLogMessage << "oSites : " << _osites << std::endl;
|
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
|
||||||
std::cout << GridLogMessage << "lSites : " << lSites() << std::endl;
|
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
|
||||||
std::cout << GridLogMessage << "gSites : " << gSites() << std::endl;
|
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
|
||||||
std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl;
|
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
@ -62,73 +62,81 @@ public:
(hunk reflowed; the new code adds _lstart/_lend bookkeeping and exact-division asserts — resulting code shown)

      return shift;
    }
    GridCartesian(const std::vector<int> &dimensions,
                  const std::vector<int> &simd_layout,
                  const std::vector<int> &processor_grid) : GridBase(processor_grid)
    {
      ///////////////////////
      // Grid information
      ///////////////////////
      _ndimension = dimensions.size();

      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
      _ldimensions.resize(_ndimension);
      _rdimensions.resize(_ndimension);
      _simd_layout.resize(_ndimension);
      _lstart.resize(_ndimension);
      _lend.resize(_ndimension);

      _ostride.resize(_ndimension);
      _istride.resize(_ndimension);

      _fsites = _gsites = _osites = _isites = 1;

      for (int d = 0; d < _ndimension; d++)
      {
        _fdimensions[d] = dimensions[d];   // Global dimensions
        _gdimensions[d] = _fdimensions[d]; // Global dimensions
        _simd_layout[d] = simd_layout[d];
        _fsites = _fsites * _fdimensions[d];
        _gsites = _gsites * _gdimensions[d];

        // Use a reduced simd grid
        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);

        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);

        _lstart[d] = _processor_coor[d] * _ldimensions[d];
        _lend[d]   = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];

        // Addressing support
        if (d == 0)
        {
          _ostride[d] = 1;
          _istride[d] = 1;
        }
        else
        {
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        }
      }

      ///////////////////////
      // subplane information
      ///////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);

      int block = 1;
      int nblock = 1;
      for (int d = 0; d < _ndimension; d++)
        nblock *= _rdimensions[d];

      for (int d = 0; d < _ndimension; d++)
      {
        nblock /= _rdimensions[d];
        _slice_block[d]  = block;
        _slice_stride[d] = _ostride[d] * _rdimensions[d];
        _slice_nblock[d] = nblock;
        block = block * _rdimensions[d];
      }
    };
};

}
#endif
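For orientation, a minimal usage sketch of this constructor (the lattice size, processor layout and the GridDefaultSimd/vComplex helpers below are assumptions for illustration, not part of this hunk):

    // Hypothetical example: an 8^4 lattice split over a 2.1.1.1 MPI decomposition.
    std::vector<int> latt({8, 8, 8, 8});
    std::vector<int> simd = GridDefaultSimd(4, vComplex::Nsimd());
    std::vector<int> mpi ({2, 1, 1, 1});
    GridCartesian Grid(latt, simd, mpi);
    // After construction, _lstart[d].._lend[d] hold this rank's global coordinate
    // range in dimension d, and the new asserts guarantee that global sizes divide
    // exactly through processors and SIMD layout at every level.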
@ -131,71 +131,83 @@ public:
(hunk reflowed; the new code adds _lstart/_lend bookkeeping, an even-size check on the checkerboard dimension, and exact-division asserts — resulting code shown)

    Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
  }
  void Init(const std::vector<int> &dimensions,
            const std::vector<int> &simd_layout,
            const std::vector<int> &processor_grid,
            const std::vector<int> &checker_dim_mask,
            int checker_dim)
  {
    ///////////////////////
    // Grid information
    ///////////////////////
    _checker_dim = checker_dim;
    assert(checker_dim_mask[checker_dim] == 1);
    _ndimension = dimensions.size();
    assert(checker_dim_mask.size() == _ndimension);
    assert(processor_grid.size() == _ndimension);
    assert(simd_layout.size() == _ndimension);

    _fdimensions.resize(_ndimension);
    _gdimensions.resize(_ndimension);
    _ldimensions.resize(_ndimension);
    _rdimensions.resize(_ndimension);
    _simd_layout.resize(_ndimension);
    _lstart.resize(_ndimension);
    _lend.resize(_ndimension);

    _ostride.resize(_ndimension);
    _istride.resize(_ndimension);

    _fsites = _gsites = _osites = _isites = 1;

    _checker_dim_mask = checker_dim_mask;

    for (int d = 0; d < _ndimension; d++)
    {
      _fdimensions[d] = dimensions[d];
      _gdimensions[d] = _fdimensions[d];
      _fsites = _fsites * _fdimensions[d];
      _gsites = _gsites * _gdimensions[d];

      if (d == _checker_dim)
      {
        assert((_gdimensions[d] & 0x1) == 0);
        _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
      }
      _ldimensions[d] = _gdimensions[d] / _processors[d];
      assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
      _lstart[d] = _processor_coor[d] * _ldimensions[d];
      _lend[d]   = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;

      // Use a reduced simd grid
      _simd_layout[d] = simd_layout[d];
      _rdimensions[d] = _ldimensions[d] / _simd_layout[d];
      assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
      assert(_rdimensions[d] > 0);

      // all elements of a simd vector must have same checkerboard.
      // If Ls vectorised, this must still be the case; e.g. dwf rb5d
      if (_simd_layout[d] > 1)
      {
        if (checker_dim_mask[d])
        {
          assert((_rdimensions[d] & 0x1) == 0);
        }
      }

      _osites *= _rdimensions[d];
      _isites *= _simd_layout[d];

      // Addressing support
      if (d == 0)
      {
        _ostride[d] = 1;
        _istride[d] = 1;
      }
      else
      {
        _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
        _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
      }
    }

    ////////////////////////////////////////////////////////////////////////////////////////////
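A hedged sketch of how the red-black grid is usually obtained from a full grid (the SpaceTimeGrid helpers and the latt/simd/mpi vectors are assumed from elsewhere in Grid, not from this hunk):

    // Sketch: derive the half-checkerboard grid from an existing full grid.
    GridCartesian         *UGrid   = QCD::SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi);
    GridRedBlackCartesian *UrbGrid = QCD::SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    // Init() halves _gdimensions[checker_dim], so that dimension must be even;
    // the new assert((_gdimensions[d] & 0x1) == 0) now reports odd sizes up front.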
@ -205,58 +217,69 @@ public:
(hunk reflowed; resulting code shown)

    _slice_stride.resize(_ndimension);
    _slice_nblock.resize(_ndimension);

    int block = 1;
    int nblock = 1;
    for (int d = 0; d < _ndimension; d++)
      nblock *= _rdimensions[d];

    for (int d = 0; d < _ndimension; d++)
    {
      nblock /= _rdimensions[d];
      _slice_block[d]  = block;
      _slice_stride[d] = _ostride[d] * _rdimensions[d];
      _slice_nblock[d] = nblock;
      block = block * _rdimensions[d];
    }

    ////////////////////////////////////////////////
    // Create a checkerboard lookup table
    ////////////////////////////////////////////////
    int rvol = 1;
    for (int d = 0; d < _ndimension; d++)
    {
      rvol = rvol * _rdimensions[d];
    }
    _checker_board.resize(rvol);
    for (int osite = 0; osite < _osites; osite++)
    {
      _checker_board[osite] = CheckerBoardFromOindex(osite);
    }

  };

protected:
  virtual int oIndex(std::vector<int> &coor)
  {
    int idx = 0;
    for (int d = 0; d < _ndimension; d++)
    {
      if (d == _checker_dim)
      {
        idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
      }
      else
      {
        idx += _ostride[d] * (coor[d] % _rdimensions[d]);
      }
    }
    return idx;
  };

  virtual int iIndex(std::vector<int> &lcoor)
  {
    int idx = 0;
    for (int d = 0; d < _ndimension; d++)
    {
      if (d == _checker_dim)
      {
        idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
      }
      else
      {
        idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
      }
    }
    return idx;
  }
};

}
#endif
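The oIndex/iIndex overrides above fold the halved checkerboard dimension back into the reduced site index. A standalone illustration of the parity convention behind the lookup table (the usual even/odd coordinate-sum rule, written here as an assumption rather than Grid's exact code):

    // Parity of a site: sum of its coordinates modulo 2 (0 = even, 1 = odd).
    int CheckerBoardOf(const std::vector<int> &coor)
    {
      int sum = 0;
      for (int d = 0; d < (int)coor.size(); d++) sum += coor[d];
      return sum & 0x1;
    }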
@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 *************************************************************************************/
 /* END LEGAL */
 #include <Grid/GridCore.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/mman.h>

 namespace Grid {

@ -34,7 +38,10 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////
 void * CartesianCommunicator::ShmCommBuf;
 uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
-CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+CartesianCommunicator::CommunicatorPolicy_t
+CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+int CartesianCommunicator::nCommThreads = -1;
+int CartesianCommunicator::Hugepages = 0;

 /////////////////////////////////
 // Alloc, free shmem region
@ -60,6 +67,7 @@ void CartesianCommunicator::ShmBufferFreeAll(void) {
 /////////////////////////////////
 // Grid information queries
 /////////////////////////////////
+int CartesianCommunicator::Dimensions(void)   { return _ndimension; };
 int CartesianCommunicator::IsBoss(void)       { return _processor==0; };
 int CartesianCommunicator::BossRank(void)     { return 0; };
 int CartesianCommunicator::ThisRank(void)     { return _processor; };
@ -88,24 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
   GlobalSumVector((double *)c,2*N);
 }

-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
+#if !defined( GRID_COMMS_MPI3)

 int CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
+int CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
+#endif
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
+                                                     int xmit_to_rank,
+                                                     void *recv,
+                                                     int recv_from_rank,
+                                                     int bytes, int dir)
+{
+  std::vector<CommsRequest_t> list;
+  // Discard the "dir"
+  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  SendToRecvFromComplete(list);
+  return 2.0*bytes;
+}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int xmit_to_rank,
                                                          void *recv,
                                                          int recv_from_rank,
-                                                         int bytes)
+                                                         int bytes, int dir)
 {
+  // Discard the "dir"
   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
   return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
   SendToRecvFromComplete(waitall);
 }
+#endif
+
+#if !defined( GRID_COMMS_MPI3)

 void CartesianCommunicator::StencilBarrier(void){};

 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
@ -119,8 +146,32 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
   return NULL;
 }
 void CartesianCommunicator::ShmInitGeneric(void){
+#if 1
+
+#if !defined(MAP_ANONYMOUS)
+#define NO_MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+  int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
+#ifdef MAP_HUGETLB
+  if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
+#endif
+  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
+  if (ShmCommBuf == (void *)MAP_FAILED) {
+    perror("mmap failed ");
+    exit(EXIT_FAILURE);
+  }
+#else
   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
   ShmCommBuf=(void *)&ShmBufStorageVector[0];
+#endif
+  bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
+
+#if defined(NO_MAP_ANONYMOUS)
+#undef MAP_ANONYMOUS
+#undef NO_MAP_ANONYMOUS
+#endif
 }

 #endif
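The huge-page branch added to ShmInitGeneric is optional at both compile time and run time; a self-contained sketch of the same allocate-with-optional-MAP_HUGETLB pattern (plain POSIX; the buffer size and flag come from the caller and are assumptions here):

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <sys/mman.h>

    // Sketch: anonymous shared mapping, optionally backed by huge pages,
    // falling back cleanly on platforms that lack MAP_HUGETLB.
    static void *AllocShm(size_t bytes, int use_hugepages)
    {
      int flags = MAP_SHARED | MAP_ANONYMOUS;
    #ifdef MAP_HUGETLB
      if (use_hugepages) flags |= MAP_HUGETLB;
    #endif
      void *p = mmap(NULL, bytes, PROT_READ | PROT_WRITE, flags, -1, 0);
      if (p == MAP_FAILED) { perror("mmap"); exit(EXIT_FAILURE); }
      memset(p, 0, bytes);  // touch the pages so failures show up early
      return p;
    }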
@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_COMMS_MPI3
|
#ifdef GRID_COMMS_MPI3
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_COMMS_MPI3L
|
#ifdef GRID_COMMS_MPIT
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_COMMS_SHMEM
|
#ifdef GRID_COMMS_SHMEM
|
||||||
@ -50,12 +50,24 @@ namespace Grid {
|
|||||||
class CartesianCommunicator {
|
class CartesianCommunicator {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
// 65536 ranks per node adequate for now
|
|
||||||
|
////////////////////////////////////////////
|
||||||
|
// Isend/Irecv/Wait, or Sendrecv blocking
|
||||||
|
////////////////////////////////////////////
|
||||||
|
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
|
||||||
|
static CommunicatorPolicy_t CommunicatorPolicy;
|
||||||
|
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
|
||||||
|
|
||||||
|
///////////////////////////////////////////
|
||||||
|
// Up to 65536 ranks per node adequate for now
|
||||||
// 128MB shared memory for comms enought for 48^4 local vol comms
|
// 128MB shared memory for comms enought for 48^4 local vol comms
|
||||||
// Give external control (command line override?) of this
|
// Give external control (command line override?) of this
|
||||||
|
///////////////////////////////////////////
|
||||||
static const int MAXLOG2RANKSPERNODE = 16;
|
static const int MAXLOG2RANKSPERNODE = 16;
|
||||||
static uint64_t MAX_MPI_SHM_BYTES;
|
static uint64_t MAX_MPI_SHM_BYTES;
|
||||||
|
static int nCommThreads;
|
||||||
|
// use explicit huge pages
|
||||||
|
static int Hugepages;
|
||||||
|
|
||||||
// Communicator should know nothing of the physics grid, only processor grid.
|
// Communicator should know nothing of the physics grid, only processor grid.
|
||||||
int _Nprocessors; // How many in all
|
int _Nprocessors; // How many in all
|
||||||
@ -64,14 +76,18 @@ class CartesianCommunicator {
|
|||||||
std::vector<int> _processor_coor; // linear processor coordinate
|
std::vector<int> _processor_coor; // linear processor coordinate
|
||||||
unsigned long _ndimension;
|
unsigned long _ndimension;
|
||||||
|
|
||||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
|
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
|
||||||
static MPI_Comm communicator_world;
|
static MPI_Comm communicator_world;
|
||||||
MPI_Comm communicator;
|
|
||||||
|
MPI_Comm communicator;
|
||||||
|
std::vector<MPI_Comm> communicator_halo;
|
||||||
|
|
||||||
typedef MPI_Request CommsRequest_t;
|
typedef MPI_Request CommsRequest_t;
|
||||||
#else
|
#else
|
||||||
typedef int CommsRequest_t;
|
typedef int CommsRequest_t;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Helper functionality for SHM Windows common to all other impls
|
// Helper functionality for SHM Windows common to all other impls
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
@ -117,10 +133,6 @@ class CartesianCommunicator {
|
|||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
static void * ShmCommBuf;
|
static void * ShmCommBuf;
|
||||||
|
|
||||||
// Isend/Irecv/Wait, or Sendrecv blocking
|
|
||||||
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
|
|
||||||
static CommunicatorPolicy_t CommunicatorPolicy;
|
|
||||||
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
|
|
||||||
|
|
||||||
size_t heap_top;
|
size_t heap_top;
|
||||||
size_t heap_bytes;
|
size_t heap_bytes;
|
||||||
@ -148,6 +160,7 @@ class CartesianCommunicator {
|
|||||||
int RankFromProcessorCoor(std::vector<int> &coor);
|
int RankFromProcessorCoor(std::vector<int> &coor);
|
||||||
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
|
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
|
||||||
|
|
||||||
|
int Dimensions(void) ;
|
||||||
int IsBoss(void) ;
|
int IsBoss(void) ;
|
||||||
int BossRank(void) ;
|
int BossRank(void) ;
|
||||||
int ThisRank(void) ;
|
int ThisRank(void) ;
|
||||||
@ -155,6 +168,7 @@ class CartesianCommunicator {
|
|||||||
const std::vector<int> & ProcessorGrid(void) ;
|
const std::vector<int> & ProcessorGrid(void) ;
|
||||||
int ProcessorCount(void) ;
|
int ProcessorCount(void) ;
|
||||||
int NodeCount(void) ;
|
int NodeCount(void) ;
|
||||||
|
int RankCount(void) ;
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||||
@ -175,6 +189,8 @@ class CartesianCommunicator {
|
|||||||
void GlobalSumVector(ComplexF *c,int N);
|
void GlobalSumVector(ComplexF *c,int N);
|
||||||
void GlobalSum(ComplexD &c);
|
void GlobalSum(ComplexD &c);
|
||||||
void GlobalSumVector(ComplexD *c,int N);
|
void GlobalSumVector(ComplexD *c,int N);
|
||||||
|
void GlobalXOR(uint32_t &);
|
||||||
|
void GlobalXOR(uint64_t &);
|
||||||
|
|
||||||
template<class obj> void GlobalSum(obj &o){
|
template<class obj> void GlobalSum(obj &o){
|
||||||
typedef typename obj::scalar_type scalar_type;
|
typedef typename obj::scalar_type scalar_type;
|
||||||
@ -207,14 +223,21 @@ class CartesianCommunicator {
|
|||||||
|
|
||||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
||||||
|
|
||||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double StencilSendToRecvFrom(void *xmit,
|
||||||
void *xmit,
|
int xmit_to_rank,
|
||||||
int xmit_to_rank,
|
void *recv,
|
||||||
void *recv,
|
int recv_from_rank,
|
||||||
int recv_from_rank,
|
int bytes,int dir);
|
||||||
int bytes);
|
|
||||||
|
|
||||||
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes,int dir);
|
||||||
|
|
||||||
|
|
||||||
|
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
|
||||||
void StencilBarrier(void);
|
void StencilBarrier(void);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
|
@ -83,6 +83,14 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
+void CartesianCommunicator::GlobalXOR(uint32_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint64_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
+  assert(ierr==0);
+}
 void CartesianCommunicator::GlobalSum(float &f){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
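The new GlobalXOR reductions are bitwise MPI_BXOR Allreduces; a hedged usage sketch (local_checksum() is a hypothetical helper, and grid is any communicator-derived object):

    // Sketch: XOR-combine per-rank checksums into one word held by every rank;
    // corruption on any single rank changes the combined value.
    uint32_t csum = local_checksum();  // hypothetical per-rank checksum
    grid->GlobalXOR(csum);             // in-place bitwise-XOR Allreduce
    std::cout << "combined checksum word 0x" << std::hex << csum << std::endl;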
@ -37,11 +37,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <sys/ipc.h>
|
#include <sys/ipc.h>
|
||||||
#include <sys/shm.h>
|
#include <sys/shm.h>
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
//#include <zlib.h>
|
#include <zlib.h>
|
||||||
#ifndef SHM_HUGETLB
|
#ifdef HAVE_NUMAIF_H
|
||||||
#define SHM_HUGETLB 04000
|
#include <numaif.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -65,6 +66,7 @@ std::vector<int> CartesianCommunicator::MyGroup;
|
|||||||
std::vector<void *> CartesianCommunicator::ShmCommBufs;
|
std::vector<void *> CartesianCommunicator::ShmCommBufs;
|
||||||
|
|
||||||
int CartesianCommunicator::NodeCount(void) { return GroupSize;};
|
int CartesianCommunicator::NodeCount(void) { return GroupSize;};
|
||||||
|
int CartesianCommunicator::RankCount(void) { return WorldSize;};
|
||||||
|
|
||||||
|
|
||||||
#undef FORCE_COMMS
|
#undef FORCE_COMMS
|
||||||
@ -210,9 +212,34 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
|
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
|
||||||
ftruncate(fd, size);
|
ftruncate(fd, size);
|
||||||
|
|
||||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
int mmap_flag = MAP_SHARED;
|
||||||
|
#ifdef MAP_HUGETLB
|
||||||
|
if (Hugepages) mmap_flag |= MAP_HUGETLB;
|
||||||
|
#endif
|
||||||
|
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
|
||||||
|
|
||||||
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
|
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
|
||||||
assert(((uint64_t)ptr&0x3F)==0);
|
assert(((uint64_t)ptr&0x3F)==0);
|
||||||
|
|
||||||
|
// Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h
|
||||||
|
#if 0
|
||||||
|
//#ifdef HAVE_NUMAIF_H
|
||||||
|
int status;
|
||||||
|
int flags=MPOL_MF_MOVE;
|
||||||
|
#ifdef KNL
|
||||||
|
int nodes=1; // numa domain == MCDRAM
|
||||||
|
// Find out if in SNC2,SNC4 mode ?
|
||||||
|
#else
|
||||||
|
int nodes=r; // numa domain == MPI ID
|
||||||
|
#endif
|
||||||
|
unsigned long count=1;
|
||||||
|
for(uint64_t page=0;page<size;page+=4096){
|
||||||
|
void *pages = (void *) ( page + (uint64_t)ptr );
|
||||||
|
uint64_t *cow_it = (uint64_t *)pages; *cow_it = 1;
|
||||||
|
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
|
||||||
|
if (ierr && (page==0)) perror("numa relocate command failed");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
ShmCommBufs[r] =ptr;
|
ShmCommBufs[r] =ptr;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -243,7 +270,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
for(int r=0;r<ShmSize;r++){
|
for(int r=0;r<ShmSize;r++){
|
||||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||||
key_t key = 0x4545 + r;
|
key_t key = 0x4545 + r;
|
||||||
if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
|
int flags = IPC_CREAT | SHM_R | SHM_W;
|
||||||
|
#ifdef SHM_HUGETLB
|
||||||
|
flags|=SHM_HUGETLB;
|
||||||
|
#endif
|
||||||
|
if ((shmids[r]= shmget(key,size, flags)) < 0) {
|
||||||
int errsv = errno;
|
int errsv = errno;
|
||||||
printf("Errno %d\n",errsv);
|
printf("Errno %d\n",errsv);
|
||||||
perror("shmget");
|
perror("shmget");
|
||||||
@ -374,8 +405,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
|||||||
{
|
{
|
||||||
int ierr;
|
int ierr;
|
||||||
communicator=communicator_world;
|
communicator=communicator_world;
|
||||||
|
|
||||||
_ndimension = processors.size();
|
_ndimension = processors.size();
|
||||||
|
|
||||||
|
communicator_halo.resize (2*_ndimension);
|
||||||
|
for(int i=0;i<_ndimension*2;i++){
|
||||||
|
MPI_Comm_dup(communicator,&communicator_halo[i]);
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Assert power of two shm_size.
|
// Assert power of two shm_size.
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
@ -509,6 +546,14 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
|
|||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
|
void CartesianCommunicator::GlobalXOR(uint32_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalXOR(uint64_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
void CartesianCommunicator::GlobalSum(float &f){
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
@ -590,13 +635,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||||
void *xmit,
|
int dest,
|
||||||
int dest,
|
void *recv,
|
||||||
void *recv,
|
int from,
|
||||||
int from,
|
int bytes,int dir)
|
||||||
int bytes)
|
|
||||||
{
|
{
|
||||||
|
std::vector<CommsRequest_t> list;
|
||||||
|
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
|
||||||
|
StencilSendToRecvFromComplete(list,dir);
|
||||||
|
return offbytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes,int dir)
|
||||||
|
{
|
||||||
|
assert(dir < communicator_halo.size());
|
||||||
|
|
||||||
MPI_Request xrq;
|
MPI_Request xrq;
|
||||||
MPI_Request rrq;
|
MPI_Request rrq;
|
||||||
|
|
||||||
@ -615,26 +674,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
gfrom = MPI_UNDEFINED;
|
gfrom = MPI_UNDEFINED;
|
||||||
#endif
|
#endif
|
||||||
if ( gfrom ==MPI_UNDEFINED) {
|
if ( gfrom ==MPI_UNDEFINED) {
|
||||||
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.push_back(rrq);
|
list.push_back(rrq);
|
||||||
off_node_bytes+=bytes;
|
off_node_bytes+=bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( gdest == MPI_UNDEFINED ) {
|
if ( gdest == MPI_UNDEFINED ) {
|
||||||
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.push_back(xrq);
|
list.push_back(xrq);
|
||||||
off_node_bytes+=bytes;
|
off_node_bytes+=bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
|
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
|
||||||
this->StencilSendToRecvFromComplete(list);
|
this->StencilSendToRecvFromComplete(list,dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
return off_node_bytes;
|
return off_node_bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
{
|
{
|
||||||
SendToRecvFromComplete(waitall);
|
SendToRecvFromComplete(waitall);
|
||||||
}
|
}
|
||||||
|
286
lib/communicator/Communicator_mpit.cc
Normal file
286
lib/communicator/Communicator_mpit.cc
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/communicator/Communicator_mpi.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/GridCore.h>
|
||||||
|
#include <Grid/GridQCDcore.h>
|
||||||
|
#include <Grid/qcd/action/ActionCore.h>
|
||||||
|
#include <mpi.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Info that is setup once and indept of cartesian layout
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
MPI_Comm CartesianCommunicator::communicator_world;
|
||||||
|
|
||||||
|
// Should error check all MPI calls.
|
||||||
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||||
|
int flag;
|
||||||
|
int provided;
|
||||||
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||||
|
if ( !flag ) {
|
||||||
|
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||||
|
if ( provided != MPI_THREAD_MULTIPLE ) {
|
||||||
|
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
||||||
|
ShmInitGeneric();
|
||||||
|
}
|
||||||
|
|
||||||
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||||
|
{
|
||||||
|
_ndimension = processors.size();
|
||||||
|
std::vector<int> periodic(_ndimension,1);
|
||||||
|
|
||||||
|
_Nprocessors=1;
|
||||||
|
_processors = processors;
|
||||||
|
_processor_coor.resize(_ndimension);
|
||||||
|
|
||||||
|
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
|
||||||
|
MPI_Comm_rank(communicator,&_processor);
|
||||||
|
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
|
||||||
|
|
||||||
|
for(int i=0;i<_ndimension;i++){
|
||||||
|
_Nprocessors*=_processors[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
communicator_halo.resize (2*_ndimension);
|
||||||
|
for(int i=0;i<_ndimension*2;i++){
|
||||||
|
MPI_Comm_dup(communicator,&communicator_halo[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
int Size;
|
||||||
|
MPI_Comm_size(communicator,&Size);
|
||||||
|
|
||||||
|
assert(Size==_Nprocessors);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalXOR(uint32_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalXOR(uint64_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(double &d)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
int rank;
|
||||||
|
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||||
|
assert(ierr==0);
|
||||||
|
return rank;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
coor.resize(_ndimension);
|
||||||
|
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
std::vector<CommsRequest_t> reqs(0);
|
||||||
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||||
|
SendToRecvFromComplete(reqs);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int sender,
|
||||||
|
int receiver,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
MPI_Status stat;
|
||||||
|
assert(sender != receiver);
|
||||||
|
int tag = sender;
|
||||||
|
if ( _processor == sender ) {
|
||||||
|
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||||
|
}
|
||||||
|
if ( _processor == receiver ) {
|
||||||
|
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
int myrank = _processor;
|
||||||
|
int ierr;
|
||||||
|
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||||
|
MPI_Request xrq;
|
||||||
|
MPI_Request rrq;
|
||||||
|
|
||||||
|
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||||
|
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||||
|
|
||||||
|
assert(ierr==0);
|
||||||
|
list.push_back(xrq);
|
||||||
|
list.push_back(rrq);
|
||||||
|
} else {
|
||||||
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
|
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||||
|
recv,bytes,MPI_CHAR,from, from,
|
||||||
|
communicator,MPI_STATUS_IGNORE);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||||
|
{
|
||||||
|
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||||
|
int nreq=list.size();
|
||||||
|
std::vector<MPI_Status> status(nreq);
|
||||||
|
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::Barrier(void)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Barrier(communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Bcast(data,
|
||||||
|
bytes,
|
||||||
|
MPI_BYTE,
|
||||||
|
root,
|
||||||
|
communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
// Should only be used prior to Grid Init finished.
|
||||||
|
// Check for this?
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
int CartesianCommunicator::RankWorld(void){
|
||||||
|
int r;
|
||||||
|
MPI_Comm_rank(communicator_world,&r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
int ierr= MPI_Bcast(data,
|
||||||
|
bytes,
|
||||||
|
MPI_BYTE,
|
||||||
|
root,
|
||||||
|
communicator_world);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
|
||||||
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes,int dir)
|
||||||
|
{
|
||||||
|
int myrank = _processor;
|
||||||
|
int ierr;
|
||||||
|
assert(dir < communicator_halo.size());
|
||||||
|
|
||||||
|
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
|
||||||
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
|
MPI_Request req[2];
|
||||||
|
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
|
||||||
|
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
|
||||||
|
|
||||||
|
list.push_back(req[0]);
|
||||||
|
list.push_back(req[1]);
|
||||||
|
return 2.0*bytes;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
|
{
|
||||||
|
int nreq=waitall.size();
|
||||||
|
MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
|
||||||
|
};
|
||||||
|
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
|
||||||
|
int xmit_to_rank,
|
||||||
|
void *recv,
|
||||||
|
int recv_from_rank,
|
||||||
|
int bytes,int dir)
|
||||||
|
{
|
||||||
|
int myrank = _processor;
|
||||||
|
int ierr;
|
||||||
|
assert(dir < communicator_halo.size());
|
||||||
|
|
||||||
|
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
|
||||||
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
|
MPI_Request req[2];
|
||||||
|
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
|
||||||
|
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
|
||||||
|
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
|
||||||
|
return 2.0*bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
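The new MPIT communicator above keeps one duplicated MPI communicator per halo direction, so stencil traffic in different directions can progress independently. A hedged plain-MPI sketch of that duplicate-per-direction idea (ndim, cart_comm, the buffers and the ranks are assumptions, not Grid's exact code):

    // Sketch: one communicator per stencil direction keeps concurrent
    // Isend/Irecv pairs in different directions on separate matching queues.
    std::vector<MPI_Comm> halo(2 * ndim);
    for (int i = 0; i < 2 * ndim; i++) MPI_Comm_dup(cart_comm, &halo[i]);

    MPI_Request req[2];
    MPI_Irecv(recv, bytes, MPI_CHAR, from, from,   halo[dir], &req[1]);
    MPI_Isend(xmit, bytes, MPI_CHAR, dest, myrank, halo[dir], &req[0]);
    MPI_Waitall(2, req, MPI_STATUSES_IGNORE);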
@ -59,6 +59,8 @@ void CartesianCommunicator::GlobalSum(double &){}
 void CartesianCommunicator::GlobalSum(uint32_t &){}
 void CartesianCommunicator::GlobalSum(uint64_t &){}
 void CartesianCommunicator::GlobalSumVector(double *,int N){}
+void CartesianCommunicator::GlobalXOR(uint32_t &){}
+void CartesianCommunicator::GlobalXOR(uint64_t &){}

 void CartesianCommunicator::SendRecvPacket(void *xmit,
                                            void *recv,
@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif

-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
Source file: ./lib/lattice/Lattice_reduction.h
|
Source file: ./lib/lattice/Lattice_reduction.h
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
@ -369,71 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
template<class vobj>
|
|
||||||
static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
|
|
||||||
int Orthog,RealD scale=1.0)
|
|
||||||
{
|
|
||||||
// FIXME: Implementation is slow
|
|
||||||
// Best base the linear combination by constructing a
|
|
||||||
// set of vectors of size grid->_rdimensions[Orthog].
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
|
||||||
typedef typename vobj::vector_type vector_type;
|
|
||||||
|
|
||||||
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
|
||||||
|
|
||||||
GridBase *FullGrid = X._grid;
|
|
||||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
|
||||||
|
|
||||||
Lattice<vobj> Xslice(SliceGrid);
|
|
||||||
Lattice<vobj> Rslice(SliceGrid);
|
|
||||||
// If we based this on Cshift it would work for spread out
|
|
||||||
// but it would be even slower
|
|
||||||
for(int i=0;i<Nblock;i++){
|
|
||||||
ExtractSlice(Rslice,Y,i,Orthog);
|
|
||||||
ExtractSlice(Xslice,X,i,Orthog);
|
|
||||||
Rslice = Rslice + Xslice*(scale*a[i]);
|
|
||||||
InsertSlice(Rslice,R,i,Orthog);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template<class vobj>
|
|
||||||
static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
|
||||||
{
|
|
||||||
// FIXME: Implementation is slow
|
|
||||||
// Look at localInnerProduct implementation,
|
|
||||||
// and do inside a site loop with block strided iterators
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
|
||||||
typedef typename vobj::vector_type vector_type;
|
|
||||||
typedef typename vobj::tensor_reduced scalar;
|
|
||||||
typedef typename scalar::scalar_object scomplex;
|
|
||||||
|
|
||||||
int Nblock = lhs._grid->GlobalDimensions()[Orthog];
|
|
||||||
vec.resize(Nblock);
|
|
||||||
std::vector<scomplex> sip(Nblock);
|
|
||||||
Lattice<scalar> IP(lhs._grid);
|
|
||||||
IP=localInnerProduct(lhs,rhs);
|
|
||||||
sliceSum(IP,sip,Orthog);
|
|
||||||
|
|
||||||
for(int ss=0;ss<Nblock;ss++){
|
|
||||||
vec[ss] = TensorRemove(sip[ss]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// FIXME: Implementation is slow
|
|
||||||
// If we based this on Cshift it would work for spread out
|
|
||||||
// but it would be even slower
|
|
||||||
//
|
|
||||||
// Repeated extract slice is inefficient
|
|
||||||
//
|
|
||||||
// Best base the linear combination by constructing a
|
|
||||||
// set of vectors of size grid->_rdimensions[Orthog].
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
||||||
{
|
{
|
||||||
int NN = BlockSolverGrid->_ndimension;
|
int NN = BlockSolverGrid->_ndimension;
|
||||||
@ -452,7 +388,7 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
|
|||||||
}
|
}
|
||||||
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
||||||
@ -464,55 +400,168 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
|||||||
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
||||||
|
|
||||||
GridBase *FullGrid = X._grid;
|
GridBase *FullGrid = X._grid;
|
||||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
|
|
||||||
Lattice<vobj> Xslice(SliceGrid);
|
// Lattice<vobj> Xslice(SliceGrid);
|
||||||
Lattice<vobj> Rslice(SliceGrid);
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
for(int i=0;i<Nblock;i++){
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
ExtractSlice(Rslice,Y,i,Orthog);
|
int nh = FullGrid->_ndimension;
|
||||||
for(int j=0;j<Nblock;j++){
|
// int nl = SliceGrid->_ndimension;
|
||||||
ExtractSlice(Xslice,X,j,Orthog);
|
int nl = nh-1;
|
||||||
Rslice = Rslice + Xslice*(scale*aa(j,i));
|
|
||||||
}
|
//FIXME package in a convenient iterator
|
||||||
InsertSlice(Rslice,R,i,Orthog);
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
|
int stride=FullGrid->_slice_stride[Orthog];
|
||||||
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
std::vector<vobj> s_x(Nblock);
|
||||||
|
|
||||||
|
#pragma omp for collapse(2)
|
||||||
|
for(int n=0;n<nblock;n++){
|
||||||
|
for(int b=0;b<block;b++){
|
||||||
|
int o = n*stride + b;
|
||||||
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
s_x[i] = X[o+i*ostride];
|
||||||
|
}
|
||||||
|
|
||||||
|
vobj dot;
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
dot = Y[o+i*ostride];
|
||||||
|
for(int j=0;j<Nblock;j++){
|
||||||
|
dot = dot + s_x[j]*(scale*aa(j,i));
|
||||||
|
}
|
||||||
|
R[o+i*ostride]=dot;
|
||||||
|
}
|
||||||
|
}}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<class vobj>
|
||||||
|
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
|
||||||
|
{
|
||||||
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
|
int Nblock = X._grid->GlobalDimensions()[Orthog];
|
||||||
|
|
||||||
|
GridBase *FullGrid = X._grid;
|
||||||
|
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
|
// Lattice<vobj> Xslice(SliceGrid);
|
||||||
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
|
int nh = FullGrid->_ndimension;
|
||||||
|
// int nl = SliceGrid->_ndimension;
|
||||||
|
int nl=1;
|
||||||
|
|
||||||
|
//FIXME package in a convenient iterator
|
||||||
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
|
int stride=FullGrid->_slice_stride[Orthog];
|
||||||
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
std::vector<vobj> s_x(Nblock);
|
||||||
|
|
||||||
|
#pragma omp for collapse(2)
|
||||||
|
for(int n=0;n<nblock;n++){
|
||||||
|
for(int b=0;b<block;b++){
|
||||||
|
int o = n*stride + b;
|
||||||
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
s_x[i] = X[o+i*ostride];
|
||||||
|
}
|
||||||
|
|
||||||
|
vobj dot;
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
dot = s_x[0]*(scale*aa(0,i));
|
||||||
|
for(int j=1;j<Nblock;j++){
|
||||||
|
dot = dot + s_x[j]*(scale*aa(j,i));
|
||||||
|
}
|
||||||
|
R[o+i*ostride]=dot;
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
||||||
{
|
{
|
||||||
// FIXME: Implementation is slow
|
|
||||||
// Not sure of best solution.. think about it
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
GridBase *FullGrid = lhs._grid;
|
GridBase *FullGrid = lhs._grid;
|
||||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
|
|
||||||
int Nblock = FullGrid->GlobalDimensions()[Orthog];
|
int Nblock = FullGrid->GlobalDimensions()[Orthog];
|
||||||
|
|
||||||
Lattice<vobj> Lslice(SliceGrid);
|
// Lattice<vobj> Lslice(SliceGrid);
|
||||||
Lattice<vobj> Rslice(SliceGrid);
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
|
||||||
for(int i=0;i<Nblock;i++){
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
ExtractSlice(Lslice,lhs,i,Orthog);
|
int nh = FullGrid->_ndimension;
|
||||||
for(int j=0;j<Nblock;j++){
|
// int nl = SliceGrid->_ndimension;
|
||||||
ExtractSlice(Rslice,rhs,j,Orthog);
|
int nl = nh-1;
|
||||||
mat(i,j) = innerProduct(Lslice,Rslice);
|
|
||||||
|
//FIXME package in a convenient iterator
|
||||||
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
|
int stride=FullGrid->_slice_stride[Orthog];
|
||||||
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
|
typedef typename vobj::vector_typeD vector_typeD;
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
std::vector<vobj> Left(Nblock);
|
||||||
|
std::vector<vobj> Right(Nblock);
|
||||||
|
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
|
||||||
|
#pragma omp for collapse(2)
|
||||||
|
for(int n=0;n<nblock;n++){
|
||||||
|
for(int b=0;b<block;b++){
|
||||||
|
|
||||||
|
int o = n*stride + b;
|
||||||
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
Left [i] = lhs[o+i*ostride];
|
||||||
|
Right[i] = rhs[o+i*ostride];
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
for(int j=0;j<Nblock;j++){
|
||||||
|
auto tmp = innerProduct(Left[i],Right[j]);
|
||||||
|
// vector_typeD rtmp = TensorRemove(tmp);
|
||||||
|
auto rtmp = TensorRemove(tmp);
|
||||||
|
mat_thread(i,j) += Reduce(rtmp);
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
#pragma omp critical
|
||||||
|
{
|
||||||
|
mat += mat_thread;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#undef FORCE_DIAG
|
|
||||||
#ifdef FORCE_DIAG
|
|
||||||
for(int i=0;i<Nblock;i++){
|
for(int i=0;i<Nblock;i++){
|
||||||
for(int j=0;j<Nblock;j++){
|
for(int j=0;j<Nblock;j++){
|
||||||
if ( i != j ) mat(i,j)=0.0;
|
ComplexD sum = mat(i,j);
|
||||||
}
|
FullGrid->GlobalSum(sum);
|
||||||
}
|
mat(i,j)=sum;
|
||||||
#endif
|
}}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -551,7 +551,10 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)

//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type
unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
{
  typedef typename vobj::vector_type vtype;

  GridBase* in_grid = in._grid;
@ -590,6 +593,54 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>
    extract1(in_vobj, out_ptrs, 0);
  }
}

//Copy an array of scalar objects in lexicographic order into a SIMD-vectorized lattice
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value
                     && !isSIMDvectorized<sobj>::value, void>::type
vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
{
  typedef typename vobj::vector_type vtype;

  GridBase* grid = out._grid;
  assert(in.size()==grid->lSites());

  int ndim  = grid->Nd();
  int nsimd = vtype::Nsimd();

  std::vector<std::vector<int> > icoor(nsimd);

  for(int lane=0; lane < nsimd; lane++){
    icoor[lane].resize(ndim);
    grid->iCoorFromIindex(icoor[lane],lane);
  }

  parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
    //Assemble vector of pointers to output elements
    std::vector<sobj*> ptrs(nsimd);

    std::vector<int> ocoor(ndim);
    grid->oCoorFromOindex(ocoor, oidx);

    std::vector<int> lcoor(grid->Nd());

    for(int lane=0; lane < nsimd; lane++){

      for(int mu=0;mu<ndim;mu++){
        lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu];
      }

      int lex;
      Lexicographic::IndexFromCoor(lcoor, lex, grid->_ldimensions);
      ptrs[lane] = &in[lex];
    }

    //pack from those ptrs
    vobj vecobj;
    merge1(vecobj, ptrs, 0);
    out._odata[oidx] = vecobj;
  }
}

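// Round-trip sketch for the new helper (illustrative only; grid construction is
// assumed). unvectorizeToLexOrdArray and vectorizeFromLexOrdArray are inverses,
// passing site data through a lexicographically ordered scalar buffer:
//
//   typedef LorentzColourMatrixD sobj;
//   Lattice<vLorentzColourMatrixD> field(grid);
//   std::vector<sobj> buf(grid->lSites());
//   unvectorizeToLexOrdArray(buf, field);   // SIMD lattice -> lex-ordered scalars
//   // ... act on buf site by site ...
//   vectorizeFromLexOrdArray(buf, field);   // lex-ordered scalars -> SIMD lattice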
//Convert a Lattice from one precision to another
template<class VobjOut, class VobjIn>
@ -615,7 +666,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);

  parallel_for(uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
    std::vector<int> out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
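// precisionChange keeps its interface; only the outer-site loop index widens to
// uint64_t. A usage sketch (both grids describe the same global volume; their
// construction is assumed and not part of this commit):
//
//   Lattice<vLorentzColourMatrixD> Ud(UGrid_d);
//   Lattice<vLorentzColourMatrixF> Uf(UGrid_f);
//   precisionChange(Uf, Ud);   // unvectorizes Ud to a scalar buffer, then refills Uf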
@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) {
  int me = 0;
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif
#ifdef GRID_COMMS_SHMEM
File diff suppressed because it is too large
@ -27,6 +27,7 @@ directory
#ifndef GRID_ILDG_IO_H
#define GRID_ILDG_IO_H

#ifdef HAVE_LIME
#include <algorithm>
#include <fstream>
#include <iomanip>
@ -37,213 +38,677 @@ directory
[This hunk removes the old ILDGGrid/ILDGChecksum/ILDGStatistics helpers, the ILDGMunger/ILDGUnmunger structs and the ILDGIO class; the replacement Lime, SciDAC and ILDG classes follow.]
#include <sys/utsname.h>
#include <unistd.h>

//C-Lime is a must have for this functionality
extern "C" {
#include "lime.h"
}

namespace Grid {
namespace QCD {

  /////////////////////////////////
  // Encode word types as strings
  /////////////////////////////////
  template<class word> inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); }
  template<> inline std::string ScidacWordMnemonic<double>  (void){ return std::string("D"); }
  template<> inline std::string ScidacWordMnemonic<float>   (void){ return std::string("F"); }
  template<> inline std::string ScidacWordMnemonic< int32_t>(void){ return std::string("I32_t"); }
  template<> inline std::string ScidacWordMnemonic<uint32_t>(void){ return std::string("U32_t"); }
  template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); }
  template<> inline std::string ScidacWordMnemonic<uint64_t>(void){ return std::string("U64_t"); }

  /////////////////////////////////////////
  // Encode a generic tensor as a string
  /////////////////////////////////////////
  template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) {

    typedef typename getPrecision<vobj>::real_scalar_type stype;

    int _ColourN       = indexRank<ColourIndex,vobj>();
    int _ColourScalar  =  isScalar<ColourIndex,vobj>();
    int _ColourVector  =  isVector<ColourIndex,vobj>();
    int _ColourMatrix  =  isMatrix<ColourIndex,vobj>();

    int _SpinN       = indexRank<SpinIndex,vobj>();
    int _SpinScalar  =  isScalar<SpinIndex,vobj>();
    int _SpinVector  =  isVector<SpinIndex,vobj>();
    int _SpinMatrix  =  isMatrix<SpinIndex,vobj>();

    int _LorentzN       = indexRank<LorentzIndex,vobj>();
    int _LorentzScalar  =  isScalar<LorentzIndex,vobj>();
    int _LorentzVector  =  isVector<LorentzIndex,vobj>();
    int _LorentzMatrix  =  isMatrix<LorentzIndex,vobj>();

    std::stringstream stream;

    stream << "GRID_";
    stream << ScidacWordMnemonic<stype>();

    //   std::cout << " Lorentz N/S/V/M : " << _LorentzN<<" "<<_LorentzScalar<<"/"<<_LorentzVector<<"/"<<_LorentzMatrix<<std::endl;
    //   std::cout << " Spin    N/S/V/M : " << _SpinN   <<" "<<_SpinScalar   <<"/"<<_SpinVector   <<"/"<<_SpinMatrix<<std::endl;
    //   std::cout << " Colour  N/S/V/M : " << _ColourN <<" "<<_ColourScalar <<"/"<<_ColourVector <<"/"<<_ColourMatrix<<std::endl;

    if ( _LorentzVector )   stream << "_LorentzVector"<<_LorentzN;
    if ( _LorentzMatrix )   stream << "_LorentzMatrix"<<_LorentzN;

    if ( _SpinVector )   stream << "_SpinVector"<<_SpinN;
    if ( _SpinMatrix )   stream << "_SpinMatrix"<<_SpinN;

    if ( _ColourVector )   stream << "_ColourVector"<<_ColourN;
    if ( _ColourMatrix )   stream << "_ColourMatrix"<<_ColourN;

    if ( _ColourScalar && _LorentzScalar && _SpinScalar )   stream << "_Complex";

    typesize = sizeof(typename vobj::scalar_type);

    if ( _ColourMatrix ) typesize*= _ColourN*_ColourN;
    else                 typesize*= _ColourN;

    if ( _SpinMatrix )   typesize*= _SpinN*_SpinN;
    else                 typesize*= _SpinN;

    colors    = _ColourN;
    spins     = _SpinN;
    datacount = _LorentzN;

    return stream.str();
  }

  template<class vobj> std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount) {
    return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
  };

  ////////////////////////////////////////////////////////////
  // Helper to fill out metadata
  ////////////////////////////////////////////////////////////
  template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
                                           FieldMetaData &header,
                                           scidacRecord & _scidacRecord,
                                           scidacFile   & _scidacFile)
  {
    typedef typename getPrecision<vobj>::real_scalar_type stype;

    /////////////////////////////////////
    // Pull Grid's metadata
    /////////////////////////////////////
    PrepareMetaData(field,header);

    /////////////////////////////////////
    // Scidac Private File structure
    /////////////////////////////////////
    _scidacFile = scidacFile(field._grid);

    /////////////////////////////////////
    // Scidac Private Record structure
    /////////////////////////////////////
    scidacRecord sr;
    sr.datatype   = ScidacRecordTypeString(field,sr.colors,sr.spins,sr.typesize,sr.datacount);
    sr.date       = header.creation_date;
    sr.precision  = ScidacWordMnemonic<stype>();
    sr.recordtype = GRID_IO_FIELD;

    _scidacRecord = sr;

    std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
  }

  ///////////////////////////////////////////////////////
  // Scidac checksum
  ///////////////////////////////////////////////////////
  static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
  {
    uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
    uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
    if ( scidac_csuma !=scidac_checksuma) return 0;
    if ( scidac_csumb !=scidac_checksumb) return 0;
    return 1;
  }

  ////////////////////////////////////////////////////////////////////////////////////
  // Lime, ILDG and Scidac I/O classes
  ////////////////////////////////////////////////////////////////////////////////////
class GridLimeReader : public BinaryIO {
 public:
  ///////////////////////////////////////////////////
  // FIXME: format for RNG? Now just binary out instead
  ///////////////////////////////////////////////////

  FILE       *File;
  LimeReader *LimeR;
  std::string filename;

  /////////////////////////////////////////////
  // Open the file
  /////////////////////////////////////////////
  void open(std::string &_filename)
  {
    filename= _filename;
    File = fopen(filename.c_str(), "r");
    LimeR = limeCreateReader(File);
  }
  /////////////////////////////////////////////
  // Close the file
  /////////////////////////////////////////////
  void close(void){
    fclose(File);
    //  limeDestroyReader(LimeR);
  }

  ////////////////////////////////////////////
  // Read a generic lattice field and verify checksum
  ////////////////////////////////////////////
  template<class vobj>
  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
  {
    typedef typename vobj::scalar_object sobj;
    scidacChecksum scidacChecksum_;
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;

    std::string format = getFormatString<vobj>();

    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {

      std::cout << GridLogMessage << limeReaderType(LimeR) <<std::endl;

      if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {

        off_t offset= ftell(File);
        BinarySimpleMunger<sobj,sobj> munge;
        BinaryIO::readLatticeObject< sobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);

        /////////////////////////////////////////////
        // Insist checksum is next record
        /////////////////////////////////////////////
        readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name);

        /////////////////////////////////////////////
        // Verify checksums
        /////////////////////////////////////////////
        scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
        return;
      }
    }
  }
  ////////////////////////////////////////////
  // Read a generic serialisable object
  ////////////////////////////////////////////
  template<class serialisable_object>
  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
  {
    std::string xmlstring;
    // should this be a do while; can we miss a first record??
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {

      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)

      if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
        std::vector<char> xmlc(nbytes+1,'\0');
        limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
        XmlReader RD(&xmlc[0],"");
        read(RD,object_name,object);
        return;
      }

    }
    assert(0);
  }
};

class GridLimeWriter : public BinaryIO {
 public:
  ///////////////////////////////////////////////////
  // FIXME: format for RNG? Now just binary out instead
  ///////////////////////////////////////////////////

  FILE       *File;
  LimeWriter *LimeW;
  std::string filename;

  void open(std::string &_filename) {
    filename= _filename;
    File = fopen(filename.c_str(), "w");
    LimeW = limeCreateWriter(File); assert(LimeW != NULL );
  }
  /////////////////////////////////////////////
  // Close the file
  /////////////////////////////////////////////
  void close(void) {
    fclose(File);
    //  limeDestroyWriter(LimeW);
  }
  ///////////////////////////////////////////////////////
  // Lime utility functions
  ///////////////////////////////////////////////////////
  int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
  {
    LimeRecordHeader *h;
    h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
    assert(limeWriteRecordHeader(h, LimeW) >= 0);
    limeDestroyHeader(h);
    return LIME_SUCCESS;
  }
  ////////////////////////////////////////////
  // Write a generic serialisable object
  ////////////////////////////////////////////
  template<class serialisable_object>
  void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name)
  {
    std::string xmlstring;
    {
      XmlWriter WR("","");
      write(WR,object_name,object);
      xmlstring = WR.XmlString();
    }
    uint64_t nbytes = xmlstring.size();
    int err;
    LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); assert(h!= NULL);

    err=limeWriteRecordHeader(h, LimeW);                    assert(err>=0);
    err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
    err=limeWriterCloseRecord(LimeW);                       assert(err>=0);
    limeDestroyHeader(h);
  }
  ////////////////////////////////////////////
  // Write a generic lattice field and csum
  ////////////////////////////////////////////
  template<class vobj>
  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
  {
    ////////////////////////////////////////////
    // Create record header
    ////////////////////////////////////////////
    typedef typename vobj::scalar_object sobj;
    int err;
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
    createLimeRecordHeader(record_name, 0, 0, PayloadSize);

    ////////////////////////////////////////////////////////////////////
    // NB: FILE and iostream are jointly writing disjoint sequences in
    // the same file through different file handles (integer units).
    //
    // These are both buffered, so why I think this code is right is as follows.
    //
    // i)   write record header to FILE *File, telegraphing the size.
    // ii)  ftell reads the offset from FILE *File .
    // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
    //      Closes iostream and flushes.
    // iv)  fseek on FILE * to end of this disjoint section.
    // v)   Continue writing scidac record.
    ////////////////////////////////////////////////////////////////////
    off_t offset = ftell(File);
    std::string format = getFormatString<vobj>();
    BinarySimpleMunger<sobj,sobj> munge;
    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
    err=limeWriterCloseRecord(LimeW);  assert(err>=0);
    ////////////////////////////////////////
    // Write checksum element, propagating forward from the BinaryIO
    // Always pair a checksum with a binary object, and close message
    ////////////////////////////////////////
    scidacChecksum checksum;
    std::stringstream streama; streama << std::hex << scidac_csuma;
    std::stringstream streamb; streamb << std::hex << scidac_csumb;
    checksum.suma= streama.str();
    checksum.sumb= streamb.str();
    std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
    writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
  }
};

class ScidacWriter : public GridLimeWriter {
 public:

  template<class SerialisableUserFile>
  void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
  {
    scidacFile _scidacFile(grid);
    writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
    writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
  }
  ////////////////////////////////////////////////
  // Write generic lattice field in scidac format
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord)
  {
    typedef typename vobj::scalar_object sobj;
    uint64_t nbytes;
    GridBase * grid = field._grid;

    ////////////////////////////////////////
    // fill the Grid header
    ////////////////////////////////////////
    FieldMetaData header;
    scidacRecord  _scidacRecord;
    scidacFile    _scidacFile;

    ScidacMetaData(field,header,_scidacRecord,_scidacFile);

    //////////////////////////////////////////////
    // Fill the Lime file record by record
    //////////////////////////////////////////////
    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
    writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
  }
};

class IldgWriter : public ScidacWriter {
 public:

  ///////////////////////////////////
  // A little helper
  ///////////////////////////////////
  void writeLimeIldgLFN(std::string &LFN)
  {
    uint64_t PayloadSize = LFN.size();
    int err;
    createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
    err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
    err=limeWriterCloseRecord(LimeW); assert(err>=0);
  }

  ////////////////////////////////////////////////////////////////
  // Special ILDG operations ; gauge configs only.
  // Don't require scidac records EXCEPT checksum
  // Use Grid MetaData object if present.
  ////////////////////////////////////////////////////////////////
  template <class vsimd>
  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
  {
    GridBase * grid = Umu._grid;
    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
    typedef iLorentzColourMatrix<vsimd> vobj;
    typedef typename vobj::scalar_object sobj;

    uint64_t nbytes;

    ////////////////////////////////////////
    // fill the Grid header
    ////////////////////////////////////////
    FieldMetaData header;
    scidacRecord  _scidacRecord;
    scidacFile    _scidacFile;

    ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);

    std::string format = header.floating_point;
    header.ensemble_id     = description;
    header.ensemble_label  = description;
    header.sequence_number = sequence;
    header.ildg_lfn        = LFN;

    assert ( (format == std::string("IEEE32BIG"))
           ||(format == std::string("IEEE64BIG")) );

    //////////////////////////////////////////////////////
    // Fill ILDG header data struct
    //////////////////////////////////////////////////////
    ildgFormat ildgfmt ;
    ildgfmt.field = std::string("su3gauge");

    if ( format == std::string("IEEE32BIG") ) {
      ildgfmt.precision = 32;
    } else {
      ildgfmt.precision = 64;
    }
    ildgfmt.version = 1.0;
    ildgfmt.lx = header.dimension[0];
    ildgfmt.ly = header.dimension[1];
    ildgfmt.lz = header.dimension[2];
    ildgfmt.lt = header.dimension[3];
    assert(header.nd==4);
    assert(header.nd==header.dimension.size());

    //////////////////////////////////////////////////////////////////////////////
    // Fill the USQCD info field
    //////////////////////////////////////////////////////////////////////////////
    usqcdInfo info;
    info.version=1.0;
    info.plaq   = header.plaquette;
    info.linktr = header.link_trace;

    std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
    //////////////////////////////////////////////
    // Fill the Lime file record by record
    //////////////////////////////////////////////
    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
    writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
    writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
    writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    writeLimeObject(0,0,ildgfmt,std::string("ildgFormat")   ,std::string(ILDG_FORMAT)); // rec
    writeLimeIldgLFN(header.ildg_lfn);                                                  // rec
    writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
    //    limeDestroyWriter(LimeW);
    fclose(File);
  }
};

class IldgReader : public GridLimeReader {
 public:

  ////////////////////////////////////////////////////////////////
  // Read either Grid/SciDAC/ILDG configuration
  // Don't require scidac records EXCEPT checksum
  // Use Grid MetaData object if present.
  // Else use ILDG MetaData object if present.
  // Else use SciDAC MetaData object if present.
  ////////////////////////////////////////////////////////////////
  template <class vsimd>
  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {

    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
    typedef typename GaugeField::vector_object  vobj;
    typedef typename vobj::scalar_object sobj;

    typedef LorentzColourMatrixF fobj;
    typedef LorentzColourMatrixD dobj;

    GridBase *grid = Umu._grid;

    std::vector<int> dims = Umu._grid->FullDimensions();

    assert(dims.size()==4);

    // Metadata holders
    ildgFormat     ildgFormat_    ;
    std::string    ildgLFN_       ;
    scidacChecksum scidacChecksum_;
    usqcdInfo      usqcdInfo_     ;

    // track what we read from file
    int found_ildgFormat    =0;
    int found_ildgLFN       =0;
    int found_scidacChecksum=0;
    int found_usqcdInfo     =0;
    int found_ildgBinary    =0;
    int found_FieldMetaData =0;

    uint32_t nersc_csum;
    uint32_t scidac_csuma;
    uint32_t scidac_csumb;

    // Binary format
    std::string format;

    //////////////////////////////////////////////////////////////////////////
    // Loop over all records
    // -- Order is poorly guaranteed except ILDG header precedes binary section.
    // -- Run like an event loop.
    // -- Impose trust hierarchy. Grid takes precedence & look for ILDG, and failing
    //    that Scidac.
    // -- Insist on Scidac checksum record.
    //////////////////////////////////////////////////////////////////////////

    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {

      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)

      //////////////////////////////////////////////////////////////////
      // If not BINARY_DATA read a string and parse
      //////////////////////////////////////////////////////////////////
      if ( strncmp(limeReaderType(LimeR), ILDG_BINARY_DATA,strlen(ILDG_BINARY_DATA) )  ) {

        // Copy out the string
        std::vector<char> xmlc(nbytes+1,'\0');
        limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
        std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;

        //////////////////////////////////
        // ILDG format record
        if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) {

          XmlReader RD(&xmlc[0],"");
          read(RD,"ildgFormat",ildgFormat_);

          if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
          if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");

          assert( ildgFormat_.lx == dims[0]);
          assert( ildgFormat_.ly == dims[1]);
          assert( ildgFormat_.lz == dims[2]);
          assert( ildgFormat_.lt == dims[3]);

          found_ildgFormat = 1;
        }

        if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
          FieldMetaData_.ildg_lfn = std::string(&xmlc[0]);
          found_ildgLFN = 1;
        }

        if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) {

          XmlReader RD(&xmlc[0],"");
          read(RD,"FieldMetaData",FieldMetaData_);

          format = FieldMetaData_.floating_point;

          assert(FieldMetaData_.dimension[0] == dims[0]);
          assert(FieldMetaData_.dimension[1] == dims[1]);
          assert(FieldMetaData_.dimension[2] == dims[2]);
          assert(FieldMetaData_.dimension[3] == dims[3]);

          found_FieldMetaData = 1;
        }

        if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) {
          std::string xmls(&xmlc[0]);
          // is it a USQCD info field
          if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) {
            std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
            XmlReader RD(&xmlc[0],"");
            read(RD,"usqcdInfo",usqcdInfo_);
            found_usqcdInfo = 1;
          }
        }

        if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) {
          XmlReader RD(&xmlc[0],"");
          read(RD,"scidacChecksum",scidacChecksum_);
          found_scidacChecksum = 1;
        }

      } else {
        /////////////////////////////////
        // Binary data
        /////////////////////////////////
        std::cout << GridLogMessage << "ILDG Binary record found : "  ILDG_BINARY_DATA << std::endl;
        off_t offset= ftell(File);

        if ( format == std::string("IEEE64BIG") ) {
          GaugeSimpleMunger<dobj, sobj> munge;
          BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
        } else {
          GaugeSimpleMunger<fobj, sobj> munge;
          BinaryIO::readLatticeObject< vobj, fobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
        }

        found_ildgBinary = 1;
      }

    }

    //////////////////////////////////////////////////////
    // Minimally must find binary segment and checksum
    // Since this is an ILDG reader require ILDG format
    //////////////////////////////////////////////////////
    assert(found_ildgBinary);
    assert(found_ildgFormat);
    assert(found_scidacChecksum);

    // Must find something with the lattice dimensions
    assert(found_FieldMetaData||found_ildgFormat);

    if ( found_FieldMetaData ) {

      std::cout << GridLogMessage<<"Grid MetaData record was found: configuration was probably written by Grid ! Yay ! "<<std::endl;

    } else {

      assert(found_ildgFormat);
      assert ( ildgFormat_.field == std::string("su3gauge") );

      ///////////////////////////////////////////////////////////////////////////////////////
      // Populate our Grid metadata as best we can
      ///////////////////////////////////////////////////////////////////////////////////////

      std::ostringstream vers; vers << ildgFormat_.version;
      FieldMetaData_.hdr_version = vers.str();
      FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");

      FieldMetaData_.nd=4;
      FieldMetaData_.dimension.resize(4);

      FieldMetaData_.dimension[0] = ildgFormat_.lx ;
      FieldMetaData_.dimension[1] = ildgFormat_.ly ;
      FieldMetaData_.dimension[2] = ildgFormat_.lz ;
      FieldMetaData_.dimension[3] = ildgFormat_.lt ;

      if ( found_usqcdInfo ) {
        FieldMetaData_.plaquette = usqcdInfo_.plaq;
        FieldMetaData_.link_trace= usqcdInfo_.linktr;
        std::cout << GridLogMessage <<"This configuration was probably written by USQCD "<<std::endl;
        std::cout << GridLogMessage <<"USQCD xml record Plaquette : "<<FieldMetaData_.plaquette<<std::endl;
        std::cout << GridLogMessage <<"USQCD xml record LinkTrace : "<<FieldMetaData_.link_trace<<std::endl;
      } else {
        FieldMetaData_.plaquette = 0.0;
        FieldMetaData_.link_trace= 0.0;
        std::cout << GridLogWarning << "This configuration is unsafe with no plaquette records that can verify it !!! "<<std::endl;
      }
    }

    ////////////////////////////////////////////////////////////
    // Really really want to mandate a scidac checksum
    ////////////////////////////////////////////////////////////
    if ( found_scidacChecksum ) {
      FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
      FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
      scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
      assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
      assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
      std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
    } else {
      std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
      assert(0); // Can I insist always checksum ?
    }

    if ( found_FieldMetaData || found_usqcdInfo ) {
      FieldMetaData checker;
      GaugeStatistics(Umu,checker);
      assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
      assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
      std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
    }
  }
};

}}

//HAVE_LIME
#endif

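A minimal write/read cycle with the new classes might look as follows; this is a usage sketch rather than code from the commit, and the grid, the gauge field Umu and the file/LFN strings are assumed to exist (the names are illustrative only).

  std::string file("./ckpoint_lat.1000");
  std::string LFN ("lfn://example/ckpoint_lat.1000");   // illustrative LFN

  IldgWriter _IldgWriter;
  _IldgWriter.open(file);
  _IldgWriter.writeConfiguration(Umu, 1000, LFN, "demo ensemble"); // also fcloses the FILE handle

  IldgReader _IldgReader;
  FieldMetaData header;
  _IldgReader.open(file);
  _IldgReader.readConfiguration(Umu, header);   // asserts the SciDAC checksum matches
  _IldgReader.close();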
@ -34,47 +34,198 @@ extern "C" { // for linkage
[This hunk removes the old ILDGtype and ILDGField definitions, including the non-LIME fallback branch, and replaces them with the record structures below.]

namespace Grid {

  /////////////////////////////////////////////////////////////////////////////////
  // Data representation of records that enter ILDG and SciDac formats
  /////////////////////////////////////////////////////////////////////////////////

#define GRID_FORMAT      "grid-format"
#define ILDG_FORMAT      "ildg-format"
#define ILDG_BINARY_DATA "ildg-binary-data"
#define ILDG_DATA_LFN    "ildg-data-lfn"
#define SCIDAC_CHECKSUM           "scidac-checksum"
#define SCIDAC_PRIVATE_FILE_XML   "scidac-private-file-xml"
#define SCIDAC_FILE_XML           "scidac-file-xml"
#define SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
#define SCIDAC_RECORD_XML         "scidac-record-xml"
#define SCIDAC_BINARY_DATA        "scidac-binary-data"
// Unused SCIDAC records names; could move to support this functionality
#define SCIDAC_SITELIST           "scidac-sitelist"

  ////////////////////////////////////////////////////////////
  const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
  const int GRID_IO_MULTIFILE  = 1; // hardcode lift from QIO compat
  const int GRID_IO_FIELD      = 0; // hardcode lift from QIO compat
  const int GRID_IO_GLOBAL     = 1; // hardcode lift from QIO compat
  ////////////////////////////////////////////////////////////

  /////////////////////////////////////////////////////////////////////////////////
  // QIO uses mandatory "private" records fixed format
  // Private is in principle "opaque" however it can't be changed now because that would break existing
  // file compatibility, so should be correct to assume the undocumented but de facto file structure.
  /////////////////////////////////////////////////////////////////////////////////

  ////////////////////////
  // Scidac private file xml
  // <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
  ////////////////////////
  struct scidacFile : Serializable {
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
                                    double, version,
                                    int, spacetime,
                                    std::string, dims, // must convert to int
                                    int, volfmt);

    std::vector<int> getDimensions(void) {
      std::stringstream stream(dims);
      std::vector<int> dimensions;
      int n;
      while(stream >> n){
        dimensions.push_back(n);
      }
      return dimensions;
    }

    void setDimensions(std::vector<int> dimensions) {
      char delimiter = ' ';
      std::stringstream stream;
      for(int i=0;i<dimensions.size();i++){
        stream << dimensions[i];
        if ( i != dimensions.size()-1) {
          stream << delimiter <<std::endl;
        }
      }
      dims = stream.str();
    }

    // Constructor provides Grid
    scidacFile() =default; // default constructor
    scidacFile(GridBase * grid){
      version   = 1.0;
      spacetime = grid->_ndimension;
      setDimensions(grid->FullDimensions());
      volfmt    = GRID_IO_SINGLEFILE;
    }

  };

  ///////////////////////////////////////////////////////////////////////
  // scidac-private-record-xml : example
  // <scidacRecord>
  // <version>1.1</version><date>Tue Jul 26 21:14:44 2011 UTC</date><recordtype>0</recordtype>
  // <datatype>QDP_D3_ColorMatrix</datatype><precision>D</precision><colors>3</colors><spins>4</spins>
  // <typesize>144</typesize><datacount>4</datacount>
  // </scidacRecord>
  ///////////////////////////////////////////////////////////////////////

  struct scidacRecord : Serializable {
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
                                    double, version,
                                    std::string, date,
                                    int, recordtype,
                                    std::string, datatype,
                                    std::string, precision,
                                    int, colors,
                                    int, spins,
                                    int, typesize,
                                    int, datacount);

    scidacRecord() { version =1.0; }

  };

  ////////////////////////
  // ILDG format
  ////////////////////////
  struct ildgFormat : Serializable {
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat,
                                    double, version,
                                    std::string, field,
                                    int, precision,
                                    int, lx,
                                    int, ly,
                                    int, lz,
                                    int, lt);
    ildgFormat() { version=1.0; };
  };
  ////////////////////////
  // USQCD info
  ////////////////////////
  struct usqcdInfo : Serializable {
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
                                    double, version,
                                    double, plaq,
                                    double, linktr,
                                    std::string, info);
    usqcdInfo() {
      version=1.0;
    };
  };
  ////////////////////////
  // Scidac Checksum
  ////////////////////////
  struct scidacChecksum : Serializable {
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
                                    double, version,
                                    std::string, suma,
                                    std::string, sumb);
    scidacChecksum() {
      version=1.0;
    };
  };
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Type: scidac-file-xml         <title>MILC ILDG archival gauge configuration</title>
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Type:
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#if 0
  ////////////////////////////////////////////////////////////////////////////////////////
  // From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
  ////////////////////////////////////////////////////////////////////////////////////////
  struct usqcdPropFile : Serializable {
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
                                    double, version,
                                    std::string, type,
                                    std::string, info);
    usqcdPropFile() {
      version=1.0;
    };
  };
  struct usqcdSourceInfo : Serializable {
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
                                    double, version,
                                    std::string, info);
    usqcdSourceInfo() {
      version=1.0;
    };
  };
  struct usqcdPropInfo : Serializable {
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
                                    double, version,
                                    int, spin,
                                    int, color,
                                    std::string, info);
    usqcdPropInfo() {
      version=1.0;
    };
  };
#endif

}
#endif
#endif
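Each of these records is an ordinary Grid Serializable, so the XML payload that writeLimeObject emits can be previewed directly with the same XmlWriter used above. A sketch (the exact whitespace depends on the XML backend):

  scidacFile pf;
  pf.version   = 1.0;
  pf.spacetime = 4;
  pf.dims      = "16 16 16 32 ";
  pf.volfmt    = GRID_IO_SINGLEFILE;

  XmlWriter WR("","");                          // in-memory writer, as writeLimeObject uses
  write(WR, pf.SerialisableClassName(), pf);
  std::cout << WR.XmlString() << std::endl;
  // expected shape, cf. the comment above:
  // <scidacFile><version>1.0</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>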
325
lib/parallelIO/MetaData.h
Normal file
@ -0,0 +1,325 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/parallelIO/NerscIO.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <fstream>
|
||||||
|
#include <map>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <sys/utsname.h>
|
||||||
|
#include <pwd.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
// Precision mapping
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
template<class vobj> static std::string getFormatString (void)
|
||||||
|
{
|
||||||
|
std::string format;
|
||||||
|
typedef typename getPrecision<vobj>::real_scalar_type stype;
|
||||||
|
if ( sizeof(stype) == sizeof(float) ) {
|
||||||
|
format = std::string("IEEE32BIG");
|
||||||
|
}
|
||||||
|
if ( sizeof(stype) == sizeof(double) ) {
|
||||||
|
format = std::string("IEEE64BIG");
|
||||||
|
}
|
||||||
|
return format;
|
||||||
|
}
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// header specification/interpretation
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
class FieldMetaData : Serializable {
|
||||||
|
public:
|
||||||
|
|
||||||
|
GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
|
||||||
|
int, nd,
|
||||||
|
std::vector<int>, dimension,
|
||||||
|
std::vector<std::string>, boundary,
|
||||||
|
int, data_start,
|
||||||
|
std::string, hdr_version,
|
||||||
|
std::string, storage_format,
|
||||||
|
double, link_trace,
|
||||||
|
double, plaquette,
|
||||||
|
uint32_t, checksum,
|
||||||
|
uint32_t, scidac_checksuma,
|
||||||
|
uint32_t, scidac_checksumb,
|
||||||
|
unsigned int, sequence_number,
|
||||||
|
std::string, data_type,
|
||||||
|
std::string, ensemble_id,
|
||||||
|
std::string, ensemble_label,
|
||||||
|
std::string, ildg_lfn,
|
||||||
|
std::string, creator,
|
||||||
|
std::string, creator_hardware,
|
||||||
|
std::string, creation_date,
|
||||||
|
std::string, archive_date,
|
||||||
|
std::string, floating_point);
|
||||||
|
FieldMetaData(void) {
|
||||||
|
nd=4;
|
||||||
|
dimension.resize(4);
|
||||||
|
boundary.resize(4);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
using namespace Grid;
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// Bit and Physical Checksumming and QA of data
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
inline void GridMetaData(GridBase *grid,FieldMetaData &header)
|
||||||
|
{
|
||||||
|
int nd = grid->_ndimension;
|
||||||
|
header.nd = nd;
|
||||||
|
header.dimension.resize(nd);
|
||||||
|
header.boundary.resize(nd);
|
||||||
|
for(int d=0;d<nd;d++) {
|
||||||
|
header.dimension[d] = grid->_fdimensions[d];
|
||||||
|
}
|
||||||
|
for(int d=0;d<nd;d++) {
|
||||||
|
header.boundary[d] = std::string("PERIODIC");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void MachineCharacteristics(FieldMetaData &header)
|
||||||
|
{
|
||||||
|
// Who
|
||||||
|
struct passwd *pw = getpwuid (getuid());
|
||||||
|
if (pw) header.creator = std::string(pw->pw_name);
|
||||||
|
|
||||||
|
// When
|
||||||
|
std::time_t t = std::time(nullptr);
|
||||||
|
std::tm tm_ = *std::localtime(&t);
|
||||||
|
std::ostringstream oss;
|
||||||
|
// oss << std::put_time(&tm_, "%c %Z");
|
||||||
|
header.creation_date = oss.str();
|
||||||
|
header.archive_date = header.creation_date;
|
||||||
|
|
||||||
|
// What
|
||||||
|
struct utsname name; uname(&name);
|
||||||
|
header.creator_hardware = std::string(name.nodename)+"-";
|
||||||
|
header.creator_hardware+= std::string(name.machine)+"-";
|
||||||
|
header.creator_hardware+= std::string(name.sysname)+"-";
|
||||||
|
header.creator_hardware+= std::string(name.release);
|
||||||
|
}
|
||||||
|

#define dump_meta_data(field, s) \
  s << "BEGIN_HEADER" << std::endl; \
  s << "HDR_VERSION = " << field.hdr_version << std::endl; \
  s << "DATATYPE = " << field.data_type << std::endl; \
  s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \
  for(int i=0;i<4;i++){ \
    s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
  } \
  s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
  s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \
  for(int i=0;i<4;i++){ \
    s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl; \
  } \
  \
  s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
  s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
  s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
  s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl; \
  s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl; \
  s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl; \
  s << "CREATOR = " << field.creator << std::endl; \
  s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl; \
  s << "CREATION_DATE = " << field.creation_date << std::endl; \
  s << "ARCHIVE_DATE = " << field.archive_date << std::endl; \
  s << "FLOATING_POINT = " << field.floating_point << std::endl; \
  s << "END_HEADER" << std::endl;
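A quick usage sketch of the macro (the field values are purely illustrative):

FieldMetaData md;
md.hdr_version    = "1.0";
md.storage_format = "1.0";
md.data_type      = "4D_SU3_GAUGE_3x3";
md.floating_point = "IEEE64BIG";
for(int d=0;d<4;d++) md.dimension[d] = 16;
for(int d=0;d<4;d++) md.boundary[d]  = "PERIODIC";
dump_meta_data(md, std::cout);   // prints BEGIN_HEADER ... END_HEADER to stdout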

template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
{
  GridBase *grid = field._grid;
  std::string format = getFormatString<vobj>();
  header.floating_point = format;
  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
  GridMetaData(grid,header);
  MachineCharacteristics(header);
}

inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
{
  // How to convert data precision etc...
  header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplF>::linkTrace(data);
  header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
}

inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
  // How to convert data precision etc...
  header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplD>::linkTrace(data);
  header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
}

template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
  GridBase *grid = field._grid;
  std::string format = getFormatString<vLorentzColourMatrixF>();
  header.floating_point = format;
  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
  GridMetaData(grid,header);
  GaugeStatistics(field,header);
  MachineCharacteristics(header);
}

template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{
  GridBase *grid = field._grid;
  std::string format = getFormatString<vLorentzColourMatrixD>();
  header.floating_point = format;
  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
  GridMetaData(grid,header);
  GaugeStatistics(field,header);
  MachineCharacteristics(header);
}
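A minimal usage sketch of the double-precision path above, stringing together only the helpers defined in this file (the grid and the gauge field are assumed to exist; this is not a standalone program):

// Sketch only: "grid" is an existing 4d GridCartesian, "Umu" a gauge configuration on it.
Lattice<vLorentzColourMatrixD> Umu(grid);
FieldMetaData header;
PrepareMetaData<vLorentzColourMatrixD>(Umu, header);   // fills format, dimensions, plaquette, creator, ...
dump_meta_data(header, std::cout);                     // emits the NERSC-style ASCII header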

//////////////////////////////////////////////////////////////////////
// Utilities ; these are QCD aware
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
  const int x=0;
  const int y=1;
  const int z=2;
  for(int mu=0;mu<Nd;mu++){
    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
  }
}
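In other words, with row index 2 the reconstructed row and adj(...) acting as complex conjugation on these scalar colour components, the routine rebuilds the third row of each SU(3) link as the conjugated cross product of the first two rows:

\[
U_{2,i} \;=\; \bigl(U_{0,j}\,U_{1,k} - U_{0,k}\,U_{1,j}\bigr)^{*},
\qquad (i,j,k) \in \{(x,y,z),\,(y,z,x),\,(z,x,y)\},
\]

which is exactly the unitarity and unit-determinant condition that makes the two-row (2x3) storage format lossless.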

////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;

typedef iLorentzColour2x3<Complex>  LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
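For SU(3) in Nd = 4 dimensions this intermediate type holds 4 x 2 x 3 = 24 complex numbers per site, versus 4 x 3 x 3 = 36 for the full links; that one-third saving is what the two-row NERSC format trades against the reconstruct3 call above on read.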
/////////////////////////////////////////////////////////////////////////////////
// Simple classes for precision conversion
/////////////////////////////////////////////////////////////////////////////////
template <class fobj, class sobj>
struct BinarySimpleUnmunger {
  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;

  void operator()(sobj &in, fobj &out) {
    // take word by word and transform according to the status
    fobj_stype *out_buffer = (fobj_stype *)&out;
    sobj_stype *in_buffer = (sobj_stype *)&in;
    size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
    size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
    assert(fobj_words == sobj_words);

    for (unsigned int word = 0; word < sobj_words; word++)
      out_buffer[word] = in_buffer[word]; // type conversion on the fly

  }
};

template <class fobj, class sobj>
struct BinarySimpleMunger {
  typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
  typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;

  void operator()(fobj &in, sobj &out) {
    // take word by word and transform according to the status
    fobj_stype *in_buffer = (fobj_stype *)&in;
    sobj_stype *out_buffer = (sobj_stype *)&out;
    size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
    size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
    assert(fobj_words == sobj_words);

    for (unsigned int word = 0; word < sobj_words; word++)
      out_buffer[word] = in_buffer[word]; // type conversion on the fly

  }
};
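A self-contained sketch of the word-by-word conversion idea used by these mungers, with plain structs and standard C++ only (the names are illustrative stand-ins, not Grid types):

#include <cassert>
#include <cstddef>
#include <iostream>

struct Su3RowD { double re[9], im[9]; };   // "double precision" stand-in
struct Su3RowF { float  re[9], im[9]; };   // "single precision" stand-in

// Convert one object word by word, relying only on both types having the
// same number of real scalar words (checked with the same assert pattern).
template <class Fobj, class Sobj, class Fscalar, class Sscalar>
void simple_unmunge(const Sobj &in, Fobj &out) {
  const Sscalar *in_buffer  = reinterpret_cast<const Sscalar *>(&in);
  Fscalar       *out_buffer = reinterpret_cast<Fscalar *>(&out);
  size_t fobj_words = sizeof(out) / sizeof(Fscalar);
  size_t sobj_words = sizeof(in)  / sizeof(Sscalar);
  assert(fobj_words == sobj_words);
  for (size_t w = 0; w < sobj_words; w++)
    out_buffer[w] = static_cast<Fscalar>(in_buffer[w]);  // narrowing/widening on the fly
}

int main() {
  Su3RowD d{}; d.re[0] = 1.0/3.0;
  Su3RowF f{};
  simple_unmunge<Su3RowF, Su3RowD, float, double>(d, f);
  std::cout << f.re[0] << std::endl;   // ~0.333333 in single precision
}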

template<class fobj,class sobj>
struct GaugeSimpleMunger{
  void operator()(fobj &in, sobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
        out(mu)()(i, j) = in(mu)()(i, j);
      }}
    }
  };
};

template <class fobj, class sobj>
struct GaugeSimpleUnmunger {

  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
        out(mu)()(i, j) = in(mu)()(i, j);
      }}
    }
  };
};

template<class fobj,class sobj>
struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
      for(int i=0;i<2;i++){
      for(int j=0;j<3;j++){
        out(mu)()(i,j) = in(mu)(i)(j);
      }}
    }
    reconstruct3(out);
  }
};

template<class fobj,class sobj>
struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
      for(int i=0;i<2;i++){
      for(int j=0;j<3;j++){
        out(mu)(i)(j) = in(mu)()(i,j);
      }}
    }
  }
};
}

}
@ -30,182 +30,11 @@
|
|||||||
#ifndef GRID_NERSC_IO_H
|
#ifndef GRID_NERSC_IO_H
|
||||||
#define GRID_NERSC_IO_H
|
#define GRID_NERSC_IO_H
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <iostream>
|
|
||||||
#include <iomanip>
|
|
||||||
#include <fstream>
|
|
||||||
#include <map>
|
|
||||||
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <sys/utsname.h>
|
|
||||||
#include <pwd.h>
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Some data types for intermediate storage
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, 4 >;
|
|
||||||
|
|
||||||
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
|
|
||||||
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
|
|
||||||
typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// header specification/interpretation
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
class NerscField {
|
|
||||||
public:
|
|
||||||
// header strings (not in order)
|
|
||||||
int dimension[4];
|
|
||||||
std::string boundary[4];
|
|
||||||
int data_start;
|
|
||||||
std::string hdr_version;
|
|
||||||
std::string storage_format;
|
|
||||||
// Checks on data
|
|
||||||
double link_trace;
|
|
||||||
double plaquette;
|
|
||||||
uint32_t checksum;
|
|
||||||
unsigned int sequence_number;
|
|
||||||
std::string data_type;
|
|
||||||
std::string ensemble_id ;
|
|
||||||
std::string ensemble_label ;
|
|
||||||
std::string creator ;
|
|
||||||
std::string creator_hardware ;
|
|
||||||
std::string creation_date ;
|
|
||||||
std::string archive_date ;
|
|
||||||
std::string floating_point;
|
|
||||||
};
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// Bit and Physical Checksumming and QA of data
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
inline void NerscGrid(GridBase *grid,NerscField &header)
|
|
||||||
{
|
|
||||||
assert(grid->_ndimension==4);
|
|
||||||
for(int d=0;d<4;d++) {
|
|
||||||
header.dimension[d] = grid->_fdimensions[d];
|
|
||||||
}
|
|
||||||
for(int d=0;d<4;d++) {
|
|
||||||
header.boundary[d] = std::string("PERIODIC");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
template<class GaugeField>
|
|
||||||
inline void NerscStatistics(GaugeField & data,NerscField &header)
|
|
||||||
{
|
|
||||||
// How to convert data precision etc...
|
|
||||||
header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplR>::linkTrace(data);
|
|
||||||
header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplR>::avgPlaquette(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void NerscMachineCharacteristics(NerscField &header)
|
|
||||||
{
|
|
||||||
// Who
|
|
||||||
struct passwd *pw = getpwuid (getuid());
|
|
||||||
if (pw) header.creator = std::string(pw->pw_name);
|
|
||||||
|
|
||||||
// When
|
|
||||||
std::time_t t = std::time(nullptr);
|
|
||||||
std::tm tm = *std::localtime(&t);
|
|
||||||
std::ostringstream oss;
|
|
||||||
// oss << std::put_time(&tm, "%c %Z");
|
|
||||||
header.creation_date = oss.str();
|
|
||||||
header.archive_date = header.creation_date;
|
|
||||||
|
|
||||||
// What
|
|
||||||
struct utsname name; uname(&name);
|
|
||||||
header.creator_hardware = std::string(name.nodename)+"-";
|
|
||||||
header.creator_hardware+= std::string(name.machine)+"-";
|
|
||||||
header.creator_hardware+= std::string(name.sysname)+"-";
|
|
||||||
header.creator_hardware+= std::string(name.release);
|
|
||||||
|
|
||||||
}
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// Utilities ; these are QCD aware
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
inline void NerscChecksum(uint32_t *buf,uint32_t buf_size_bytes,uint32_t &csum)
|
|
||||||
{
|
|
||||||
BinaryIO::Uint32Checksum(buf,buf_size_bytes,csum);
|
|
||||||
}
|
|
||||||
inline void reconstruct3(LorentzColourMatrix & cm)
|
|
||||||
{
|
|
||||||
const int x=0;
|
|
||||||
const int y=1;
|
|
||||||
const int z=2;
|
|
||||||
for(int mu=0;mu<4;mu++){
|
|
||||||
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
|
|
||||||
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
|
|
||||||
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class fobj,class sobj>
|
|
||||||
struct NerscSimpleMunger{
|
|
||||||
void operator()(fobj &in, sobj &out, uint32_t &csum) {
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
|
||||||
for (int i = 0; i < Nc; i++) {
|
|
||||||
for (int j = 0; j < Nc; j++) {
|
|
||||||
out(mu)()(i, j) = in(mu)()(i, j);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
NerscChecksum((uint32_t *)&in, sizeof(in), csum);
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
template <class fobj, class sobj>
|
|
||||||
struct NerscSimpleUnmunger {
|
|
||||||
void operator()(sobj &in, fobj &out, uint32_t &csum) {
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
|
||||||
for (int i = 0; i < Nc; i++) {
|
|
||||||
for (int j = 0; j < Nc; j++) {
|
|
||||||
out(mu)()(i, j) = in(mu)()(i, j);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
NerscChecksum((uint32_t *)&out, sizeof(out), csum);
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class fobj,class sobj>
|
|
||||||
struct Nersc3x2munger{
|
|
||||||
void operator() (fobj &in,sobj &out,uint32_t &csum){
|
|
||||||
|
|
||||||
NerscChecksum((uint32_t *)&in,sizeof(in),csum);
|
|
||||||
|
|
||||||
for(int mu=0;mu<4;mu++){
|
|
||||||
for(int i=0;i<2;i++){
|
|
||||||
for(int j=0;j<3;j++){
|
|
||||||
out(mu)()(i,j) = in(mu)(i)(j);
|
|
||||||
}}
|
|
||||||
}
|
|
||||||
reconstruct3(out);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class fobj,class sobj>
|
|
||||||
struct Nersc3x2unmunger{
|
|
||||||
|
|
||||||
void operator() (sobj &in,fobj &out,uint32_t &csum){
|
|
||||||
|
|
||||||
|
|
||||||
for(int mu=0;mu<4;mu++){
|
|
||||||
for(int i=0;i<2;i++){
|
|
||||||
for(int j=0;j<3;j++){
|
|
||||||
out(mu)(i)(j) = in(mu)()(i,j);
|
|
||||||
}}
|
|
||||||
}
|
|
||||||
|
|
||||||
NerscChecksum((uint32_t *)&out,sizeof(out),csum);
|
|
||||||
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Write and read from fstream; compute header offset for payload
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -216,42 +45,17 @@ namespace Grid {
|
|||||||
std::ofstream fout(file,std::ios::out);
|
std::ofstream fout(file,std::ios::out);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define dump_nersc_header(field, s) \
|
static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
|
||||||
s << "BEGIN_HEADER" << std::endl; \
|
|
||||||
s << "HDR_VERSION = " << field.hdr_version << std::endl; \
|
|
||||||
s << "DATATYPE = " << field.data_type << std::endl; \
|
|
||||||
s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \
|
|
||||||
for(int i=0;i<4;i++){ \
|
|
||||||
s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
|
|
||||||
} \
|
|
||||||
s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
|
|
||||||
s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \
|
|
||||||
for(int i=0;i<4;i++){ \
|
|
||||||
s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl; \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
|
|
||||||
s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl; \
|
|
||||||
s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl; \
|
|
||||||
s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl; \
|
|
||||||
s << "CREATOR = " << field.creator << std::endl; \
|
|
||||||
s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl; \
|
|
||||||
s << "CREATION_DATE = " << field.creation_date << std::endl; \
|
|
||||||
s << "ARCHIVE_DATE = " << field.archive_date << std::endl; \
|
|
||||||
s << "FLOATING_POINT = " << field.floating_point << std::endl; \
|
|
||||||
s << "END_HEADER" << std::endl;
|
|
||||||
|
|
||||||
static inline unsigned int writeHeader(NerscField &field,std::string file)
|
|
||||||
{
|
{
|
||||||
std::ofstream fout(file,std::ios::out|std::ios::in);
|
std::ofstream fout(file,std::ios::out|std::ios::in);
|
||||||
fout.seekp(0,std::ios::beg);
|
fout.seekp(0,std::ios::beg);
|
||||||
dump_nersc_header(field, fout);
|
dump_meta_data(field, fout);
|
||||||
field.data_start = fout.tellp();
|
field.data_start = fout.tellp();
|
||||||
return field.data_start;
|
return field.data_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
// for the header-reader
|
// for the header-reader
|
||||||
static inline int readHeader(std::string file,GridBase *grid, NerscField &field)
|
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
|
||||||
{
|
{
|
||||||
int offset=0;
|
int offset=0;
|
||||||
std::map<std::string,std::string> header;
|
std::map<std::string,std::string> header;
|
||||||
@ -323,21 +127,21 @@ namespace Grid {
|
|||||||
return field.data_start;
|
return field.data_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Now the meat: the object readers
|
// Now the meat: the object readers
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#define PARALLEL_READ
|
|
||||||
#define PARALLEL_WRITE
|
|
||||||
|
|
||||||
template<class vsimd>
|
template<class vsimd>
|
||||||
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,NerscField& header,std::string file)
|
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
|
||||||
{
|
FieldMetaData& header,
|
||||||
|
std::string file)
|
||||||
|
{
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
||||||
|
|
||||||
GridBase *grid = Umu._grid;
|
GridBase *grid = Umu._grid;
|
||||||
int offset = readHeader(file,Umu._grid,header);
|
int offset = readHeader(file,Umu._grid,header);
|
||||||
|
|
||||||
NerscField clone(header);
|
FieldMetaData clone(header);
|
||||||
|
|
||||||
std::string format(header.floating_point);
|
std::string format(header.floating_point);
|
||||||
|
|
||||||
@ -346,76 +150,78 @@ namespace Grid {
|
|||||||
int ieee64big = (format == std::string("IEEE64BIG"));
|
int ieee64big = (format == std::string("IEEE64BIG"));
|
||||||
int ieee64 = (format == std::string("IEEE64"));
|
int ieee64 = (format == std::string("IEEE64"));
|
||||||
|
|
||||||
uint32_t csum;
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
// depending on datatype, set up munger;
|
// depending on datatype, set up munger;
|
||||||
// munger is a function of <floating point, Real, data_type>
|
// munger is a function of <floating point, Real, data_type>
|
||||||
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
|
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
|
||||||
if ( ieee32 || ieee32big ) {
|
|
||||||
#ifdef PARALLEL_READ
|
|
||||||
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
|
|
||||||
(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
|
|
||||||
#else
|
|
||||||
csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
|
|
||||||
(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
if ( ieee64 || ieee64big ) {
|
|
||||||
#ifdef PARALLEL_READ
|
|
||||||
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
|
|
||||||
(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
|
|
||||||
#else
|
|
||||||
csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
|
|
||||||
(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
|
|
||||||
if ( ieee32 || ieee32big ) {
|
if ( ieee32 || ieee32big ) {
|
||||||
#ifdef PARALLEL_READ
|
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
|
||||||
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
|
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
|
||||||
(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
#else
|
|
||||||
csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
|
|
||||||
(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
if ( ieee64 || ieee64big ) {
|
if ( ieee64 || ieee64big ) {
|
||||||
#ifdef PARALLEL_READ
|
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
|
||||||
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
|
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
|
||||||
(Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
#else
|
}
|
||||||
csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
|
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
|
||||||
(Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format);
|
if ( ieee32 || ieee32big ) {
|
||||||
#endif
|
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
|
||||||
|
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
|
||||||
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
}
|
||||||
|
if ( ieee64 || ieee64big ) {
|
||||||
|
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
|
||||||
|
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
|
||||||
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
NerscStatistics<GaugeField>(Umu,clone);
|
GaugeStatistics(Umu,clone);
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<< csum<< std::dec
|
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
|
||||||
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
|
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
|
||||||
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
|
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
|
||||||
<<" header "<<header.plaquette<<std::endl;
|
<<" header "<<header.plaquette<<std::endl;
|
||||||
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
|
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
|
||||||
<<" header "<<header.link_trace<<std::endl;
|
<<" header "<<header.link_trace<<std::endl;
|
||||||
|
|
||||||
|
if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
|
||||||
|
std::cout << " Plaquette mismatch "<<std::endl;
|
||||||
|
std::cout << Umu[0]<<std::endl;
|
||||||
|
std::cout << Umu[1]<<std::endl;
|
||||||
|
}
|
||||||
|
if ( nersc_csum != header.checksum ) {
|
||||||
|
std::cerr << " checksum mismatch " << std::endl;
|
||||||
|
std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
|
||||||
|
std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
|
||||||
|
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
|
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
|
||||||
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
|
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
|
||||||
assert(csum == header.checksum );
|
assert(nersc_csum == header.checksum );
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
|
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
|
||||||
}
|
}
|
||||||
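The new diagnostics make the acceptance criteria explicit: a configuration is accepted only when |plaquette_computed - plaquette_header| < 1e-5, |link_trace_computed - link_trace_header| < 1e-6, and the recomputed NERSC checksum equals the stored one exactly. For orientation, the average plaquette being compared is, in the conventional normalisation assumed here (unit value on a free-field configuration),

\[
P \;=\; \frac{1}{6\,V\,N_c}\sum_{x}\sum_{\mu<\nu}
\mathrm{Re}\,\mathrm{Tr}\; U_\mu(x)\,U_\nu(x+\hat\mu)\,U^\dagger_\mu(x+\hat\nu)\,U^\dagger_\nu(x).
\]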
|
|
||||||
template<class vsimd>
|
template<class vsimd>
|
||||||
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,std::string file, int two_row,int bits32)
|
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
|
||||||
|
std::string file,
|
||||||
|
int two_row,
|
||||||
|
int bits32)
|
||||||
{
|
{
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
||||||
|
|
||||||
typedef iLorentzColourMatrix<vsimd> vobj;
|
typedef iLorentzColourMatrix<vsimd> vobj;
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
|
FieldMetaData header;
|
||||||
|
///////////////////////////////////////////
|
||||||
// Following should become arguments
|
// Following should become arguments
|
||||||
NerscField header;
|
///////////////////////////////////////////
|
||||||
header.sequence_number = 1;
|
header.sequence_number = 1;
|
||||||
header.ensemble_id = "UKQCD";
|
header.ensemble_id = "UKQCD";
|
||||||
header.ensemble_label = "DWF";
|
header.ensemble_label = "DWF";
|
||||||
@ -425,45 +231,32 @@ namespace Grid {
|
|||||||
|
|
||||||
GridBase *grid = Umu._grid;
|
GridBase *grid = Umu._grid;
|
||||||
|
|
||||||
NerscGrid(grid,header);
|
GridMetaData(grid,header);
|
||||||
NerscStatistics<GaugeField>(Umu,header);
|
assert(header.nd==4);
|
||||||
NerscMachineCharacteristics(header);
|
GaugeStatistics(Umu,header);
|
||||||
|
MachineCharacteristics(header);
|
||||||
|
|
||||||
uint32_t csum;
|
|
||||||
int offset;
|
int offset;
|
||||||
|
|
||||||
truncate(file);
|
truncate(file);
|
||||||
|
|
||||||
if ( two_row ) {
|
// Sod it -- always write 3x3 double
|
||||||
|
header.floating_point = std::string("IEEE64BIG");
|
||||||
|
header.data_type = std::string("4D_SU3_GAUGE_3x3");
|
||||||
|
GaugeSimpleUnmunger<fobj3D,sobj> munge;
|
||||||
|
offset = writeHeader(header,file);
|
||||||
|
|
||||||
header.floating_point = std::string("IEEE64BIG");
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
header.data_type = std::string("4D_SU3_GAUGE");
|
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
|
||||||
Nersc3x2unmunger<fobj2D,sobj> munge;
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
BinaryIO::Uint32Checksum<vobj,fobj2D>(Umu, munge,header.checksum);
|
header.checksum = nersc_csum;
|
||||||
offset = writeHeader(header,file);
|
writeHeader(header,file);
|
||||||
#ifdef PARALLEL_WRITE
|
|
||||||
csum=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point);
|
|
||||||
#else
|
|
||||||
csum=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point);
|
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
header.floating_point = std::string("IEEE64BIG");
|
|
||||||
header.data_type = std::string("4D_SU3_GAUGE_3x3");
|
|
||||||
NerscSimpleUnmunger<fobj3D,sobj> munge;
|
|
||||||
BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
|
|
||||||
offset = writeHeader(header,file);
|
|
||||||
#ifdef PARALLEL_WRITE
|
|
||||||
csum=BinaryIO::writeObjectParallel<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
|
|
||||||
#else
|
|
||||||
csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
|
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
|
||||||
|
<<std::hex<<header.checksum
|
||||||
|
<<std::dec<<" plaq "<< header.plaquette <<std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////
|
///////////////////////////////
|
||||||
// RNG state
|
// RNG state
|
||||||
///////////////////////////////
|
///////////////////////////////
|
||||||
@ -472,19 +265,19 @@ namespace Grid {
|
|||||||
typedef typename GridParallelRNG::RngStateType RngStateType;
|
typedef typename GridParallelRNG::RngStateType RngStateType;
|
||||||
|
|
||||||
// Following should become arguments
|
// Following should become arguments
|
||||||
NerscField header;
|
FieldMetaData header;
|
||||||
header.sequence_number = 1;
|
header.sequence_number = 1;
|
||||||
header.ensemble_id = "UKQCD";
|
header.ensemble_id = "UKQCD";
|
||||||
header.ensemble_label = "DWF";
|
header.ensemble_label = "DWF";
|
||||||
|
|
||||||
GridBase *grid = parallel._grid;
|
GridBase *grid = parallel._grid;
|
||||||
|
|
||||||
NerscGrid(grid,header);
|
GridMetaData(grid,header);
|
||||||
|
assert(header.nd==4);
|
||||||
header.link_trace=0.0;
|
header.link_trace=0.0;
|
||||||
header.plaquette=0.0;
|
header.plaquette=0.0;
|
||||||
NerscMachineCharacteristics(header);
|
MachineCharacteristics(header);
|
||||||
|
|
||||||
uint32_t csum;
|
|
||||||
int offset;
|
int offset;
|
||||||
|
|
||||||
#ifdef RNG_RANLUX
|
#ifdef RNG_RANLUX
|
||||||
@ -502,15 +295,19 @@ namespace Grid {
|
|||||||
|
|
||||||
truncate(file);
|
truncate(file);
|
||||||
offset = writeHeader(header,file);
|
offset = writeHeader(header,file);
|
||||||
csum=BinaryIO::writeRNGSerial(serial,parallel,file,offset);
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
header.checksum = csum;
|
BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
header.checksum = nersc_csum;
|
||||||
offset = writeHeader(header,file);
|
offset = writeHeader(header,file);
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"Written NERSC RNG STATE "<<file<< " checksum "<<std::hex<<csum<<std::dec<<std::endl;
|
std::cout<<GridLogMessage
|
||||||
|
<<"Written NERSC RNG STATE "<<file<< " checksum "
|
||||||
|
<<std::hex<<header.checksum
|
||||||
|
<<std::dec<<std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,NerscField& header,std::string file)
|
static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
|
||||||
{
|
{
|
||||||
typedef typename GridParallelRNG::RngStateType RngStateType;
|
typedef typename GridParallelRNG::RngStateType RngStateType;
|
||||||
|
|
||||||
@ -518,7 +315,7 @@ namespace Grid {
|
|||||||
|
|
||||||
int offset = readHeader(file,grid,header);
|
int offset = readHeader(file,grid,header);
|
||||||
|
|
||||||
NerscField clone(header);
|
FieldMetaData clone(header);
|
||||||
|
|
||||||
std::string format(header.floating_point);
|
std::string format(header.floating_point);
|
||||||
std::string data_type(header.data_type);
|
std::string data_type(header.data_type);
|
||||||
@ -538,15 +335,19 @@ namespace Grid {
|
|||||||
|
|
||||||
// depending on datatype, set up munger;
|
// depending on datatype, set up munger;
|
||||||
// munger is a function of <floating point, Real, data_type>
|
// munger is a function of <floating point, Real, data_type>
|
||||||
uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset);
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
|
BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
|
||||||
assert(csum == header.checksum );
|
if ( nersc_csum != header.checksum ) {
|
||||||
|
std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
assert(nersc_csum == header.checksum );
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
|
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
}}
|
}}
|
||||||
#endif
|
#endif
|
||||||
|
@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
|
|||||||
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
|
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
|
||||||
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
|
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
|
||||||
// 4
|
// 4
|
||||||
#ifdef AVX512
|
#ifdef KNL
|
||||||
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
|
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
|
||||||
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
|
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
|
||||||
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
|
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
|
||||||
|
@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
|||||||
for(int i=0; i < Ls; i++){
|
for(int i=0; i < Ls; i++){
|
||||||
as[i] = 1.0;
|
as[i] = 1.0;
|
||||||
omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
|
omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
|
||||||
// assert(fabs(omega[i])>0.0);
|
assert(omega[i]!=Coeff_t(0.0));
|
||||||
bs[i] = 0.5*(bpc/omega[i] + bmc);
|
bs[i] = 0.5*(bpc/omega[i] + bmc);
|
||||||
cs[i] = 0.5*(bpc/omega[i] - bmc);
|
cs[i] = 0.5*(bpc/omega[i] - bmc);
|
||||||
}
|
}
|
||||||
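For reference, the coefficients guarded by these new assertions are, writing b and c for the Mobius parameters and assuming (as the variable names suggest) that bpc = b + c and bmc = b - c:

\[
\omega_i = \gamma_i\, z_{\rm hi},\qquad
b_s^{(i)} = \frac{1}{2}\!\left(\frac{b+c}{\omega_i} + (b-c)\right),\qquad
c_s^{(i)} = \frac{1}{2}\!\left(\frac{b+c}{\omega_i} - (b-c)\right),
\]

so a vanishing \(\omega_i\), i.e. a zero coefficient \(\gamma_i\), would divide by zero; that is exactly what assert(omega[i]!=Coeff_t(0.0)) now rules out before the division is attempted.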
@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
|||||||
|
|
||||||
for(int i=0;i<Ls;i++){
|
for(int i=0;i<Ls;i++){
|
||||||
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
|
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
|
||||||
// assert(fabs(bee[i])>0.0);
|
assert(bee[i]!=Coeff_t(0.0));
|
||||||
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
|
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
|
||||||
beo[i]=as[i]*bs[i];
|
beo[i]=as[i]*bs[i];
|
||||||
ceo[i]=-as[i]*cs[i];
|
ceo[i]=-as[i]*cs[i];
|
||||||
@ -456,10 +456,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
|||||||
|
|
||||||
if ( i < Ls-1 ) {
|
if ( i < Ls-1 ) {
|
||||||
|
|
||||||
|
assert(bee[i]!=Coeff_t(0.0));
|
||||||
|
assert(bee[0]!=Coeff_t(0.0));
|
||||||
|
|
||||||
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
|
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
|
||||||
|
|
||||||
leem[i]=mass*cee[Ls-1]/bee[0];
|
leem[i]=mass*cee[Ls-1]/bee[0];
|
||||||
for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1];
|
for(int j=0;j<i;j++) {
|
||||||
|
assert(bee[j+1]!=Coeff_t(0.0));
|
||||||
|
leem[i]*= aee[j]/bee[j+1];
|
||||||
|
}
|
||||||
|
|
||||||
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
|
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
|
||||||
|
|
||||||
@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
|||||||
{
|
{
|
||||||
Coeff_t delta_d=mass*cee[Ls-1];
|
Coeff_t delta_d=mass*cee[Ls-1];
|
||||||
for(int j=0;j<Ls-1;j++) {
|
for(int j=0;j<Ls-1;j++) {
|
||||||
// assert(fabs(bee[j])>0.0);
|
assert(bee[j] != Coeff_t(0.0));
|
||||||
delta_d *= cee[j]/bee[j];
|
delta_d *= cee[j]/bee[j];
|
||||||
}
|
}
|
||||||
dee[Ls-1] += delta_d;
|
dee[Ls-1] += delta_d;
|
||||||
|
@ -644,19 +644,16 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
|
|
||||||
template <typename vtype> using iImplScalar = iScalar<iScalar<iScalar<vtype> > >;
|
|
||||||
template <typename vtype> using iImplSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
template <typename vtype> using iImplSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
||||||
template <typename vtype> using iImplHalfSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
template <typename vtype> using iImplHalfSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
||||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
||||||
template <typename vtype> using iImplPropagator = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
|
template <typename vtype> using iImplPropagator = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
|
||||||
|
|
||||||
typedef iImplScalar<Simd> SiteComplex;
|
|
||||||
typedef iImplSpinor<Simd> SiteSpinor;
|
typedef iImplSpinor<Simd> SiteSpinor;
|
||||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||||
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
||||||
typedef iImplPropagator<Simd> SitePropagator;
|
typedef iImplPropagator<Simd> SitePropagator;
|
||||||
|
|
||||||
typedef Lattice<SiteComplex> ComplexField;
|
|
||||||
typedef Lattice<SiteSpinor> FermionField;
|
typedef Lattice<SiteSpinor> FermionField;
|
||||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||||
typedef Lattice<SitePropagator> PropagatorField;
|
typedef Lattice<SitePropagator> PropagatorField;
|
||||||
@ -775,7 +772,6 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
|
|
||||||
template <typename vtype> using iImplScalar = iScalar<iScalar<iScalar<vtype> > >;
|
|
||||||
template <typename vtype> using iImplSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
template <typename vtype> using iImplSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
||||||
template <typename vtype> using iImplHalfSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
template <typename vtype> using iImplHalfSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
||||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
||||||
@ -792,12 +788,10 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||||
typedef Lattice<SitePropagator> PropagatorField;
|
typedef Lattice<SitePropagator> PropagatorField;
|
||||||
|
|
||||||
typedef iImplScalar<Simd> SiteComplex;
|
|
||||||
typedef iImplSpinor<Simd> SiteSpinor;
|
typedef iImplSpinor<Simd> SiteSpinor;
|
||||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||||
|
|
||||||
|
|
||||||
typedef Lattice<SiteComplex> ComplexField;
|
|
||||||
typedef Lattice<SiteSpinor> FermionField;
|
typedef Lattice<SiteSpinor> FermionField;
|
||||||
|
|
||||||
typedef SimpleCompressor<SiteSpinor> Compressor;
|
typedef SimpleCompressor<SiteSpinor> Compressor;
|
||||||
|
@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
{
|
{
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
int LLs = in._grid->_rdimensions[0];
|
int LLs = in._grid->_rdimensions[0];
|
||||||
st.HaloExchange(in,compressor);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
DhopCommTime -= usecond();
|
||||||
|
st.HaloExchange(in,compressor);
|
||||||
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
DhopTotalTime += usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls+=1;
|
||||||
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
||||||
conformable(in._grid,out._grid); // drops the cb check
|
conformable(in._grid,out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls+=1;
|
||||||
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
|
||||||
conformable(in._grid,out._grid); // drops the cb check
|
conformable(in._grid,out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls+=2;
|
||||||
conformable(in._grid,FermionGrid()); // verifies full grid
|
conformable(in._grid,FermionGrid()); // verifies full grid
|
||||||
conformable(in._grid,out._grid);
|
conformable(in._grid,out._grid);
|
||||||
|
|
||||||
@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
|
|||||||
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
|
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::Report(void)
|
||||||
|
{
|
||||||
|
std::vector<int> latt = GridDefaultLatt();
|
||||||
|
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||||
|
RealD NP = _FourDimGrid->_Nprocessors;
|
||||||
|
RealD NN = _FourDimGrid->NodeCount();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : "
|
||||||
|
<< DhopCalls << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : "
|
||||||
|
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : "
|
||||||
|
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : "
|
||||||
|
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||||
|
|
||||||
|
// Average the compute time
|
||||||
|
_FourDimGrid->GlobalSum(DhopComputeTime);
|
||||||
|
DhopComputeTime/=NP;
|
||||||
|
|
||||||
|
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||||
|
|
||||||
|
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" <<std::endl; Stencil.Report();
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
|
||||||
|
}
|
||||||
|
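In equation form, with the per-site flop count of 1154 and the factor of 2 for red-black (checkerboard) counting taken from the code above, and times accumulated in microseconds, the reported rate is

\[
\mathrm{MFlop/s} \;=\; \frac{1154 \times V \times N_{\rm calls}}{2\,T},
\qquad V = L_s \prod_{\mu=0}^{3} L_\mu,
\]

where T is the rank-averaged DhopComputeTime for the per-call figure and DhopTotalTime (communication plus compute) for the "(full)" figure.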
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
|
||||||
|
{
|
||||||
|
DhopCalls = 0;
|
||||||
|
DhopTotalTime = 0;
|
||||||
|
DhopCommTime = 0;
|
||||||
|
DhopComputeTime = 0;
|
||||||
|
Stencil.ZeroCounters();
|
||||||
|
StencilEven.ZeroCounters();
|
||||||
|
StencilOdd.ZeroCounters();
|
||||||
|
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////
|
||||||
// Implement the general interface. Here we use SAME mass on all slices
|
// Implement the general interface. Here we use SAME mass on all slices
|
||||||
|
@ -55,6 +55,16 @@ namespace QCD {
|
|||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
|
////////////////////////////////////////
|
||||||
|
// Performance monitoring
|
||||||
|
////////////////////////////////////////
|
||||||
|
void Report(void);
|
||||||
|
void ZeroCounters(void);
|
||||||
|
double DhopTotalTime;
|
||||||
|
double DhopCalls;
|
||||||
|
double DhopCommTime;
|
||||||
|
double DhopComputeTime;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
|
|||||||
template<class vobj,class cobj>
|
template<class vobj,class cobj>
|
||||||
class WilsonStencil : public CartesianStencil<vobj,cobj> {
|
class WilsonStencil : public CartesianStencil<vobj,cobj> {
|
||||||
public:
|
public:
|
||||||
|
double timer0;
|
||||||
|
double timer1;
|
||||||
|
double timer2;
|
||||||
|
double timer3;
|
||||||
|
double timer4;
|
||||||
|
double timer5;
|
||||||
|
double timer6;
|
||||||
|
uint64_t callsi;
|
||||||
|
void ZeroCountersi(void)
|
||||||
|
{
|
||||||
|
timer0=0;
|
||||||
|
timer1=0;
|
||||||
|
timer2=0;
|
||||||
|
timer3=0;
|
||||||
|
timer4=0;
|
||||||
|
timer5=0;
|
||||||
|
timer6=0;
|
||||||
|
callsi=0;
|
||||||
|
}
|
||||||
|
void Reporti(int calls)
|
||||||
|
{
|
||||||
|
if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
|
||||||
|
if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate) " <<timer1/calls <<std::endl;
|
||||||
|
if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge ) " <<timer2/calls <<std::endl;
|
||||||
|
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
|
||||||
|
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
|
||||||
|
}
|
||||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||||
|
|
||||||
std::vector<int> same_node;
|
std::vector<int> same_node;
|
||||||
@ -252,6 +278,7 @@ public:
|
|||||||
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
||||||
same_node(npoints)
|
same_node(npoints)
|
||||||
{
|
{
|
||||||
|
ZeroCountersi();
|
||||||
surface_list.resize(0);
|
surface_list.resize(0);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -261,7 +288,6 @@ public:
|
|||||||
// Here we know the distance is 1 for WilsonStencil
|
// Here we know the distance is 1 for WilsonStencil
|
||||||
for(int point=0;point<this->_npoints;point++){
|
for(int point=0;point<this->_npoints;point++){
|
||||||
same_node[point] = this->SameNode(point);
|
same_node[point] = this->SameNode(point);
|
||||||
// std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int site = 0 ;site< vol4;site++){
|
for(int site = 0 ;site< vol4;site++){
|
||||||
@ -282,17 +308,28 @@ public:
|
|||||||
{
|
{
|
||||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||||
this->HaloExchangeOptGather(source,compress);
|
this->HaloExchangeOptGather(source,compress);
|
||||||
this->CommunicateBegin(reqs);
|
double t1=usecond();
|
||||||
this->CommunicateComplete(reqs);
|
// Asynchronous MPI calls multidirectional, Isend etc...
|
||||||
|
// this->CommunicateBegin(reqs);
|
||||||
|
// this->CommunicateComplete(reqs);
|
||||||
|
// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
|
||||||
|
this->Communicate();
|
||||||
|
double t2=usecond(); timer1 += t2-t1;
|
||||||
this->CommsMerge(compress);
|
this->CommsMerge(compress);
|
||||||
|
double t3=usecond(); timer2 += t3-t2;
|
||||||
this->CommsMergeSHM(compress);
|
this->CommsMergeSHM(compress);
|
||||||
|
double t4=usecond(); timer3 += t4-t3;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class compressor>
|
template <class compressor>
|
||||||
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
|
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
this->Prepare();
|
this->Prepare();
|
||||||
|
double t0=usecond();
|
||||||
this->HaloGatherOpt(source,compress);
|
this->HaloGatherOpt(source,compress);
|
||||||
|
double t1=usecond();
|
||||||
|
timer0 += t1-t0;
|
||||||
|
callsi++;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class compressor>
|
template <class compressor>
|
||||||
@ -304,7 +341,9 @@ public:
|
|||||||
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
|
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
|
||||||
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
|
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
|
||||||
|
|
||||||
|
this->mpi3synctime_g-=usecond();
|
||||||
this->_grid->StencilBarrier();
|
this->_grid->StencilBarrier();
|
||||||
|
this->mpi3synctime_g+=usecond();
|
||||||
|
|
||||||
assert(source._grid==this->_grid);
|
assert(source._grid==this->_grid);
|
||||||
this->halogtime-=usecond();
|
this->halogtime-=usecond();
|
||||||
@ -323,7 +362,6 @@ public:
|
|||||||
int dag = compress.dag;
|
int dag = compress.dag;
|
||||||
int face_idx=0;
|
int face_idx=0;
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
// std::cout << " Optimised Dagger compress " <<std::endl;
|
|
||||||
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
||||||
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
||||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
||||||
@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
     int vol4;
     vol4=FourDimGrid.oSites();
     Stencil.BuildSurfaceList(LLs,vol4);

     vol4=FourDimRedBlackGrid.oSites();
     StencilEven.BuildSurfaceList(LLs,vol4);
     StencilOdd.BuildSurfaceList(LLs,vol4);

-    std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-                                <<" " << StencilEven.surface_list.size()<<std::endl;
+    // std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
+    //                             <<" " << StencilEven.surface_list.size()<<std::endl;

 }

 template<class Impl>
 void WilsonFermion5D<Impl>::Report(void)
 {
-  std::vector<int> latt = GridDefaultLatt();
-  RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
   RealD NP = _FourDimGrid->_Nprocessors;
   RealD NN = _FourDimGrid->NodeCount();
+  RealD volume = Ls;
+  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
+  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];

   if ( DhopCalls > 0 ) {
     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
     std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
     std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
   }
+  if ( DhopCalls > 0){
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
+  }
 }

 template<class Impl>
@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
   Stencil.ZeroCounters();
   StencilEven.ZeroCounters();
   StencilOdd.ZeroCounters();
+  Stencil.ZeroCountersi();
+  StencilEven.ZeroCountersi();
+  StencilOdd.ZeroCountersi();
 }

@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 {
 #ifdef GRID_OMP
   //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

   Compressor compressor(dag);

@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg

   DhopFaceTime-=usecond();
   st.HaloExchangeOptGather(in,compressor);
-  DhopFaceTime+=usecond();
-  std::vector<std::vector<CommsRequest_t> > reqs;
-
-  // Rely on async comms; start comms before merge of local data
-  DhopCommTime-=usecond();
-  st.CommunicateBegin(reqs);
-
-  DhopFaceTime-=usecond();
-  st.CommsMergeSHM(compressor);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
   DhopFaceTime+=usecond();

-  // Perhaps use omp task and region
-#pragma omp parallel
+  double ctime=0;
+  double ptime=0;

+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
   {
+    int tid = omp_get_thread_num();
     int nthreads = omp_get_num_threads();
-    int me = omp_get_thread_num();
-    int myoff, mywork;
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    assert(nthreads > ncomms);
+    if (tid >= ncomms) {
+      double start = usecond();
+      nthreads -= ncomms;
+      int ttid = tid - ncomms;
+      int n = U._grid->oSites();
+      int chunk = n / nthreads;
+      int rem = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }

-    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
-    int sF = LLs * myoff;
-
-    if ( me == 0 ) {
-      st.CommunicateComplete(reqs);
-      DhopCommTime+=usecond();
-    } else {
-      // Interior links in stencil
-      if ( me==1 ) DhopComputeTime-=usecond();
-      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      else                  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      if ( me==1 ) DhopComputeTime+=usecond();
+      // do the compute
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+          int sF = LLs * sU;
+          Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+          int sF = LLs * sU;
+          Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+        }
+      }
+      ptime = usecond() - start;
+    }
+    {
+      double start = usecond();
+      st.CommunicateThreaded();
+      ctime = usecond() - start;
     }
   }
+  DhopCommTime += ctime;
+  DhopComputeTime+=ptime;

+  // First to enter, last to leave timing
+  st.CollateThreads();

   DhopFaceTime-=usecond();
   st.CommsMerge(compressor);
   DhopFaceTime+=usecond();

-  // Load imbalance alert. Should use dynamic schedule OMP for loop
-  // Perhaps create a list of only those sites with face work, and
-  // load balance process the list.
   DhopComputeTime2-=usecond();
   if (dag == DaggerYes) {
     int sz=st.surface_list.size();
@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 #else
   assert(0);
 #endif

 }


 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
                                                     DoubledGaugeField & U,
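The rewritten DhopInternalOverlappedComms dedicates the first CartesianCommunicator::nCommThreads OpenMP threads to communication and hands each remaining thread a contiguous block of the interior site loop. Below is a minimal stand-alone sketch of the same block-partition arithmetic, with stand-in work arrays in place of the Grid stencil and kernels; none of the names here are Grid API, and it needs -fopenmp to build.

// sketch: split OpenMP threads into "comms" and "compute" roles, chunk the site loop
#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  const int nsites = 103;                 // stands in for U._grid->oSites()
  const int ncomms = 2;                   // threads reserved for communication
  std::vector<int> touched(nsites, 0);

  omp_set_num_threads(4);                 // make the example deterministic
#pragma omp parallel
  {
    int tid      = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
    if (tid >= ncomms && nthreads > ncomms) {
      // compute threads: near-equal contiguous blocks, same arithmetic as the diff above
      int nwork = nthreads - ncomms;
      int ttid  = tid - ncomms;
      int chunk = nsites / nwork;
      int rem   = nsites % nwork;
      int myblock, myn;
      if (ttid < rem) { myblock = ttid * chunk + ttid; myn = chunk + 1; }
      else            { myblock = ttid * chunk + rem;  myn = chunk;     }
      for (int ss = myblock; ss < myblock + myn; ++ss) touched[ss]++;   // "DhopSite"
    } else {
      // comms threads: would drive the halo exchange (st.CommunicateThreaded())
    }
  }

  // every site must be visited exactly once by exactly one compute thread
  for (int s = 0; s < nsites; ++s)
    if (touched[s] != 1) { std::printf("bad split at site %d\n", s); return 1; }
  std::printf("all %d sites covered once\n", nsites);
  return 0;
}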
|
@ -40,12 +40,15 @@ namespace QCD {
|
|||||||
typedef typename GImpl::Simd Simd; \
|
typedef typename GImpl::Simd Simd; \
|
||||||
typedef typename GImpl::LinkField GaugeLinkField; \
|
typedef typename GImpl::LinkField GaugeLinkField; \
|
||||||
typedef typename GImpl::Field GaugeField; \
|
typedef typename GImpl::Field GaugeField; \
|
||||||
|
typedef typename GImpl::ComplexField ComplexField;\
|
||||||
typedef typename GImpl::SiteField SiteGaugeField; \
|
typedef typename GImpl::SiteField SiteGaugeField; \
|
||||||
|
typedef typename GImpl::SiteComplex SiteComplex; \
|
||||||
typedef typename GImpl::SiteLink SiteGaugeLink;
|
typedef typename GImpl::SiteLink SiteGaugeLink;
|
||||||
|
|
||||||
#define INHERIT_FIELD_TYPES(Impl) \
|
#define INHERIT_FIELD_TYPES(Impl) \
|
||||||
typedef typename Impl::Simd Simd; \
|
typedef typename Impl::Simd Simd; \
|
||||||
typedef typename Impl::SiteField SiteField; \
|
typedef typename Impl::ComplexField ComplexField; \
|
||||||
|
typedef typename Impl::SiteField SiteField; \
|
||||||
typedef typename Impl::Field Field;
|
typedef typename Impl::Field Field;
|
||||||
|
|
||||||
// hardcodes the exponential approximation in the template
|
// hardcodes the exponential approximation in the template
|
||||||
@ -53,14 +56,17 @@ template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplType
|
|||||||
public:
|
public:
|
||||||
typedef S Simd;
|
typedef S Simd;
|
||||||
|
|
||||||
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation>>>;
|
template <typename vtype> using iImplScalar = iScalar<iScalar<iScalar<vtype> > >;
|
||||||
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation>>, Nd>;
|
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
|
||||||
|
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
|
||||||
|
|
||||||
|
typedef iImplScalar<Simd> SiteComplex;
|
||||||
typedef iImplGaugeLink<Simd> SiteLink;
|
typedef iImplGaugeLink<Simd> SiteLink;
|
||||||
typedef iImplGaugeField<Simd> SiteField;
|
typedef iImplGaugeField<Simd> SiteField;
|
||||||
|
|
||||||
typedef Lattice<SiteLink> LinkField;
|
typedef Lattice<SiteComplex> ComplexField;
|
||||||
typedef Lattice<SiteField> Field;
|
typedef Lattice<SiteLink> LinkField;
|
||||||
|
typedef Lattice<SiteField> Field;
|
||||||
|
|
||||||
// Guido: we can probably separate the types from the HMC functions
|
// Guido: we can probably separate the types from the HMC functions
|
||||||
// this will create 2 kind of implementations
|
// this will create 2 kind of implementations
|
||||||
|
@ -41,11 +41,13 @@ namespace QCD{
|
|||||||
template <typename vtype>
|
template <typename vtype>
|
||||||
using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
|
using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
|
||||||
|
|
||||||
typedef iImplGaugeLink<Simd> SiteLink;
|
typedef iImplGaugeLink<Simd> SiteLink;
|
||||||
typedef iImplGaugeField<Simd> SiteField;
|
typedef iImplGaugeField<Simd> SiteField;
|
||||||
|
typedef SiteField SiteComplex;
|
||||||
|
|
||||||
typedef Lattice<SiteLink> LinkField;
|
typedef Lattice<SiteLink> LinkField;
|
||||||
typedef Lattice<SiteField> Field;
|
typedef Lattice<SiteField> Field;
|
||||||
|
typedef Field ComplexField;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef QedGimpl<vComplex> QedGimplR;
|
typedef QedGimpl<vComplex> QedGimplR;
|
||||||
@ -31,6 +31,7 @@ directory

 #include <Grid/qcd/action/scalar/ScalarImpl.h>
 #include <Grid/qcd/action/scalar/ScalarAction.h>
+#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>

 namespace Grid {
 namespace QCD {
@ -39,6 +40,10 @@ namespace QCD {
   typedef ScalarAction<ScalarImplF> ScalarActionF;
   typedef ScalarAction<ScalarImplD> ScalarActionD;

+  template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;

 }
 }

@ -6,10 +6,10 @@

     Copyright (C) 2015

 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>

     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@ -36,49 +36,48 @@ directory
 namespace Grid {
   // FIXME drop the QCD namespace everywhere here

 template <class Impl>
 class ScalarAction : public QCD::Action<typename Impl::Field> {
  public:
   INHERIT_FIELD_TYPES(Impl);

  private:
   RealD mass_square;
   RealD lambda;

  public:
-  ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
+  ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}

-  virtual std::string LogParameters(){
+  virtual std::string LogParameters() {
     std::stringstream sstream;
     sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda << std::endl;
     sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
     return sstream.str();
   }
+  virtual std::string action_name() {return "ScalarAction";}

-  virtual std::string action_name(){return "ScalarAction";}
-
-  virtual void refresh(const Field &U,
-                       GridParallelRNG &pRNG){};  // noop as no pseudoferms
+  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}  // noop as no pseudoferms

   virtual RealD S(const Field &p) {
     return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
            (lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
            ScalarObs<Impl>::sumphider(p);
   };

   virtual void deriv(const Field &p,
                      Field &force) {
     Field tmp(p._grid);
     Field p2(p._grid);
     ScalarObs<Impl>::phisquared(p2, p);
     tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
     for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);

-    force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
-  };
+    force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
+  }
 };

-}  // Grid
+
+}  // namespace Grid

 #endif // SCALAR_ACTION_H
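For reference, the lattice phi^4 action and force that S() and deriv() above implement can be written out as follows. The kinetic-term normalisation is inferred from deriv() (each forward link counted once), not checked against the ScalarObs helpers, so treat it as a consistent reading rather than the definitive convention.

% Action and force of ScalarAction (unit lattice spacing, N_d dimensions)
\begin{align}
  S[\phi] &= \sum_x \Big[ \big(\tfrac{m^2}{2} + N_d\big)\,\phi_x^2
            + \frac{\lambda}{24}\,\phi_x^4
            - \sum_{\mu=1}^{N_d} \phi_x\,\phi_{x+\hat\mu} \Big],\\
  \frac{\partial S}{\partial \phi_x} &= \big(m^2 + 2N_d\big)\,\phi_x
            + \frac{\lambda}{6}\,\phi_x^3
            - \sum_{\mu=1}^{N_d} \big(\phi_{x+\hat\mu} + \phi_{x-\hat\mu}\big).
\end{align}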
|
@ -5,9 +5,9 @@
|
|||||||
namespace Grid {
|
namespace Grid {
|
||||||
//namespace QCD {
|
//namespace QCD {
|
||||||
|
|
||||||
template <class S>
|
template <class S>
|
||||||
class ScalarImplTypes {
|
class ScalarImplTypes {
|
||||||
public:
|
public:
|
||||||
typedef S Simd;
|
typedef S Simd;
|
||||||
|
|
||||||
template <typename vtype>
|
template <typename vtype>
|
||||||
@ -15,8 +15,10 @@ namespace Grid {
|
|||||||
|
|
||||||
typedef iImplField<Simd> SiteField;
|
typedef iImplField<Simd> SiteField;
|
||||||
typedef SiteField SitePropagator;
|
typedef SiteField SitePropagator;
|
||||||
|
typedef SiteField SiteComplex;
|
||||||
|
|
||||||
typedef Lattice<SiteField> Field;
|
typedef Lattice<SiteField> Field;
|
||||||
|
typedef Field ComplexField;
|
||||||
typedef Field FermionField;
|
typedef Field FermionField;
|
||||||
typedef Field PropagatorField;
|
typedef Field PropagatorField;
|
||||||
|
|
||||||
@ -26,11 +28,11 @@ namespace Grid {
|
|||||||
|
|
||||||
static inline Field projectForce(Field& P){return P;}
|
static inline Field projectForce(Field& P){return P;}
|
||||||
|
|
||||||
static inline void update_field(Field& P, Field& U, double ep){
|
static inline void update_field(Field& P, Field& U, double ep) {
|
||||||
U += P*ep;
|
U += P*ep;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline RealD FieldSquareNorm(Field& U){
|
static inline RealD FieldSquareNorm(Field& U) {
|
||||||
return (- sum(trace(U*U))/2.0);
|
return (- sum(trace(U*U))/2.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -88,42 +90,49 @@ namespace Grid {
|
|||||||
};
|
};
|
||||||
|
|
||||||
template <class S, unsigned int N>
|
template <class S, unsigned int N>
|
||||||
class ScalarMatrixImplTypes {
|
class ScalarAdjMatrixImplTypes {
|
||||||
public:
|
public:
|
||||||
typedef S Simd;
|
typedef S Simd;
|
||||||
|
typedef QCD::SU<N> Group;
|
||||||
|
|
||||||
template <typename vtype>
|
template <typename vtype>
|
||||||
using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
|
using iImplField = iScalar<iScalar<iMatrix<vtype, N>>>;
|
||||||
|
template <typename vtype>
|
||||||
|
using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;
|
||||||
|
|
||||||
typedef iImplField<Simd> SiteField;
|
typedef iImplField<Simd> SiteField;
|
||||||
|
typedef SiteField SitePropagator;
|
||||||
|
typedef iImplComplex<Simd> SiteComplex;
|
||||||
|
|
||||||
|
typedef Lattice<SiteField> Field;
|
||||||
|
typedef Lattice<SiteComplex> ComplexField;
|
||||||
|
typedef Field FermionField;
|
||||||
|
typedef Field PropagatorField;
|
||||||
|
|
||||||
typedef Lattice<SiteField> Field;
|
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
|
||||||
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
|
||||||
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
|
|
||||||
gaussian(pRNG, P);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline Field projectForce(Field& P){return P;}
|
static inline Field projectForce(Field& P) {return P;}
|
||||||
|
|
||||||
static inline void update_field(Field& P, Field& U, double ep){
|
static inline void update_field(Field& P, Field& U, double ep) {
|
||||||
U += P*ep;
|
U += P*ep;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline RealD FieldSquareNorm(Field& U){
|
static inline RealD FieldSquareNorm(Field& U) {
|
||||||
return (TensorRemove(- sum(trace(U*U))*0.5).real());
|
return (TensorRemove(sum(trace(U*U))).real());
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
gaussian(pRNG, U);
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
gaussian(pRNG, U);
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
U = 1.0;
|
U = zero;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -138,6 +147,15 @@ namespace Grid {
|
|||||||
typedef ScalarImplTypes<vComplexF> ScalarImplCF;
|
typedef ScalarImplTypes<vComplexF> ScalarImplCF;
|
||||||
typedef ScalarImplTypes<vComplexD> ScalarImplCD;
|
typedef ScalarImplTypes<vComplexD> ScalarImplCD;
|
||||||
|
|
||||||
|
// Hardcoding here the size of the matrices
|
||||||
|
typedef ScalarAdjMatrixImplTypes<vComplex, QCD::Nc> ScalarAdjImplR;
|
||||||
|
typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
|
||||||
|
typedef ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc> ScalarAdjImplD;
|
||||||
|
|
||||||
|
template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex, Colours >;
|
||||||
|
template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, Colours >;
|
||||||
|
template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, Colours >;
|
||||||
|
|
||||||
//}
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,10 +6,7 @@
|
|||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
Author: Guido Cossu <guido,cossu@ed.ac.uk>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: neo <cossu@post.kek.jp>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -30,55 +27,122 @@ directory
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef SCALAR_ACTION_H
|
#ifndef SCALAR_INT_ACTION_H
|
||||||
#define SCALAR_ACTION_H
|
#define SCALAR_INT_ACTION_H
|
||||||
|
|
||||||
|
|
||||||
|
// Note: this action can completely absorb the ScalarAction for real float fields
|
||||||
|
// use the scalarObjs to generalise the structure
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
// FIXME drop the QCD namespace everywhere here
|
// FIXME drop the QCD namespace everywhere here
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl, int Ndim >
|
||||||
class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
|
class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
|
||||||
public:
|
public:
|
||||||
INHERIT_FIELD_TYPES(Impl);
|
INHERIT_FIELD_TYPES(Impl);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
RealD mass_square;
|
RealD mass_square;
|
||||||
RealD lambda;
|
RealD lambda;
|
||||||
|
|
||||||
public:
|
|
||||||
ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
|
|
||||||
|
|
||||||
virtual std::string LogParameters(){
|
typedef typename Field::vector_object vobj;
|
||||||
|
typedef CartesianStencil<vobj,vobj> Stencil;
|
||||||
|
|
||||||
|
SimpleCompressor<vobj> compressor;
|
||||||
|
int npoint = 2*Ndim;
|
||||||
|
std::vector<int> directions;// = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions
|
||||||
|
std::vector<int> displacements;// = {1,1,1,1, -1,-1,-1,-1};
|
||||||
|
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
|
||||||
|
for (int mu = 0 ; mu < Ndim; mu++){
|
||||||
|
directions[mu] = mu; directions[mu+Ndim] = mu;
|
||||||
|
displacements[mu] = 1; displacements[mu+Ndim] = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual std::string LogParameters() {
|
||||||
std::stringstream sstream;
|
std::stringstream sstream;
|
||||||
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
|
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
|
||||||
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
|
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
|
||||||
return sstream.str();
|
return sstream.str();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual std::string action_name(){return "ScalarAction";}
|
virtual std::string action_name() {return "ScalarAction";}
|
||||||
|
|
||||||
virtual void refresh(const Field &U,
|
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
|
||||||
GridParallelRNG &pRNG){}; // noop as no pseudoferms
|
|
||||||
|
|
||||||
virtual RealD S(const Field &p) {
|
virtual RealD S(const Field &p) {
|
||||||
return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
|
assert(p._grid->Nd() == Ndim);
|
||||||
(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
|
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
|
||||||
ScalarObs<Impl>::sumphider(p);
|
phiStencil.HaloExchange(p, compressor);
|
||||||
|
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
|
||||||
|
phisquared = p*p;
|
||||||
|
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
|
||||||
|
for (int mu = 0; mu < Ndim; mu++) {
|
||||||
|
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
|
||||||
|
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
|
||||||
|
int permute_type;
|
||||||
|
StencilEntry *SE;
|
||||||
|
vobj temp2;
|
||||||
|
const vobj *temp, *t_p;
|
||||||
|
|
||||||
|
SE = phiStencil.GetEntry(permute_type, mu, i);
|
||||||
|
t_p = &p._odata[i];
|
||||||
|
if ( SE->_is_local ) {
|
||||||
|
temp = &p._odata[SE->_offset];
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
permute(temp2, *temp, permute_type);
|
||||||
|
action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
|
||||||
|
} else {
|
||||||
|
action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// action -= pshift*p + p*pshift;
|
||||||
|
}
|
||||||
|
// NB the trace in the algebra is normalised to 1/2
|
||||||
|
// minus sign coming from the antihermitian fields
|
||||||
|
return -(TensorRemove(sum(trace(action)))).real();
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual void deriv(const Field &p,
|
virtual void deriv(const Field &p, Field &force) {
|
||||||
Field &force) {
|
assert(p._grid->Nd() == Ndim);
|
||||||
Field tmp(p._grid);
|
force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
|
||||||
Field p2(p._grid);
|
// move this outside
|
||||||
ScalarObs<Impl>::phisquared(p2, p);
|
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
|
||||||
tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
|
phiStencil.HaloExchange(p, compressor);
|
||||||
for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
|
||||||
|
|
||||||
force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
|
//for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
||||||
};
|
for (int point = 0; point < npoint; point++) {
|
||||||
|
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
|
||||||
|
const vobj *temp;
|
||||||
|
vobj temp2;
|
||||||
|
int permute_type;
|
||||||
|
StencilEntry *SE;
|
||||||
|
SE = phiStencil.GetEntry(permute_type, point, i);
|
||||||
|
|
||||||
|
if ( SE->_is_local ) {
|
||||||
|
temp = &p._odata[SE->_offset];
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
permute(temp2, *temp, permute_type);
|
||||||
|
force._odata[i] -= temp2;
|
||||||
|
} else {
|
||||||
|
force._odata[i] -= *temp;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} // Grid
|
} // namespace Grid
|
||||||
|
|
||||||
#endif // SCALAR_ACTION_H
|
#endif // SCALAR_INT_ACTION_H
|
||||||
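The new ScalarInteractionAction above replaces whole-field Cshift calls with a CartesianStencil: halo-exchange once, then look up each neighbour per site inside a flat parallel loop. Here is a minimal stand-alone sketch of that access pattern on a 1-D periodic toy array; the names are stand-ins for illustration, not Grid's stencil API.

// sketch: precompute neighbour offsets per site/direction, then accumulate
// nearest-neighbour products in a flat loop instead of shifting whole fields
#include <cstdio>
#include <vector>

int main() {
  const int L = 8, ndim = 1, npoint = 2 * ndim;   // 1-D periodic toy lattice
  std::vector<double> phi(L);
  for (int x = 0; x < L; ++x) phi[x] = 0.1 * x;

  // "stencil": offsets for +mu and -mu with periodic wrap (stand-in for GetEntry)
  std::vector<std::vector<int>> neigh(npoint, std::vector<int>(L));
  for (int x = 0; x < L; ++x) {
    neigh[0][x] = (x + 1) % L;        // +mu
    neigh[1][x] = (x - 1 + L) % L;    // -mu
  }

  // kinetic accumulation, analogous to action._odata[i] -= phi_neighbour * phi_site
  double kinetic = 0.0;
  for (int point = 0; point < npoint; ++point)
    for (int x = 0; x < L; ++x)
      kinetic -= phi[neigh[point][x]] * phi[x];

  std::printf("kinetic term = %f\n", kinetic);
  return 0;
}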
@ -207,6 +207,12 @@ using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator,
 typedef HMCWrapperTemplate<ScalarImplR, MinimumNorm2, ScalarFields>
     ScalarGenericHMCRunner;

+typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
+    ScalarAdjGenericHMCRunner;
+
+template <int Colours>
+using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;

 }  // namespace QCD
 }  // namespace Grid

@ -76,7 +76,7 @@ struct HMCparameters: Serializable {
|
|||||||
|
|
||||||
template < class ReaderClass >
|
template < class ReaderClass >
|
||||||
void initialize(Reader<ReaderClass> &TheReader){
|
void initialize(Reader<ReaderClass> &TheReader){
|
||||||
std::cout << "Reading HMC\n";
|
std::cout << GridLogMessage << "Reading HMC\n";
|
||||||
read(TheReader, "HMC", *this);
|
read(TheReader, "HMC", *this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -165,7 +165,7 @@ class HMCResourceManager {
|
|||||||
// Grids
|
// Grids
|
||||||
//////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void AddGrid(std::string s, GridModule& M) {
|
void AddGrid(const std::string s, GridModule& M) {
|
||||||
// Check for name clashes
|
// Check for name clashes
|
||||||
auto search = Grids.find(s);
|
auto search = Grids.find(s);
|
||||||
if (search != Grids.end()) {
|
if (search != Grids.end()) {
|
||||||
@ -174,14 +174,24 @@ class HMCResourceManager {
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
Grids[s] = std::move(M);
|
Grids[s] = std::move(M);
|
||||||
|
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
|
||||||
|
Grids[s].show_full_decomposition();
|
||||||
|
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add a named grid set, 4d shortcut
|
// Add a named grid set, 4d shortcut
|
||||||
void AddFourDimGrid(std::string s) {
|
void AddFourDimGrid(const std::string s) {
|
||||||
GridFourDimModule<vComplex> Mod;
|
GridFourDimModule<vComplex> Mod;
|
||||||
AddGrid(s, Mod);
|
AddGrid(s, Mod);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add a named grid set, 4d shortcut + tweak simd lanes
|
||||||
|
void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
|
||||||
|
GridFourDimModule<vComplex> Mod(simd_decomposition);
|
||||||
|
AddGrid(s, Mod);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
GridCartesian* GetCartesian(std::string s = "") {
|
GridCartesian* GetCartesian(std::string s = "") {
|
||||||
@ -253,6 +263,7 @@ class HMCResourceManager {
|
|||||||
template<class T, class... Types>
|
template<class T, class... Types>
|
||||||
void AddObservable(Types&&... Args){
|
void AddObservable(Types&&... Args){
|
||||||
ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
|
ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
|
||||||
|
ObservablesList.back()->print_parameters();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
|
std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
|
||||||
|
@ -43,11 +43,12 @@ public:
|
|||||||
std::string, lattice,
|
std::string, lattice,
|
||||||
std::string, mpi);
|
std::string, mpi);
|
||||||
|
|
||||||
std::vector<int> getLattice(){return strToVec<int>(lattice);}
|
std::vector<int> getLattice() const {return strToVec<int>(lattice);}
|
||||||
std::vector<int> getMpi() {return strToVec<int>(mpi);}
|
std::vector<int> getMpi() const {return strToVec<int>(mpi);}
|
||||||
|
|
||||||
void check(){
|
|
||||||
if (getLattice().size() != getMpi().size()) {
|
void check() const {
|
||||||
|
if (getLattice().size() != getMpi().size() ) {
|
||||||
std::cout << GridLogError
|
std::cout << GridLogError
|
||||||
<< "Error in GridModuleParameters: lattice and mpi dimensions "
|
<< "Error in GridModuleParameters: lattice and mpi dimensions "
|
||||||
"do not match"
|
"do not match"
|
||||||
@ -84,6 +85,8 @@ class GridModule {
|
|||||||
|
|
||||||
void set_full(GridCartesian* grid) { grid_.reset(grid); }
|
void set_full(GridCartesian* grid) { grid_.reset(grid); }
|
||||||
void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
|
void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
|
||||||
|
void show_full_decomposition(){ grid_->show_decomposition(); }
|
||||||
|
void show_rb_decomposition(){ rbgrid_->show_decomposition(); }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
std::unique_ptr<GridCartesian> grid_;
|
std::unique_ptr<GridCartesian> grid_;
|
||||||
@ -95,31 +98,72 @@ class GridModule {
|
|||||||
// Classes for the user
|
// Classes for the user
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
// Note: the space time grid should be out of the QCD namespace
|
// Note: the space time grid should be out of the QCD namespace
|
||||||
template< class vector_type>
|
template <class vector_type>
|
||||||
class GridFourDimModule : public GridModule {
|
class GridFourDimModule : public GridModule
|
||||||
public:
|
{
|
||||||
GridFourDimModule() {
|
public:
|
||||||
|
GridFourDimModule()
|
||||||
|
{
|
||||||
using namespace QCD;
|
using namespace QCD;
|
||||||
set_full(SpaceTimeGrid::makeFourDimGrid(
|
set_full(SpaceTimeGrid::makeFourDimGrid(
|
||||||
GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
|
GridDefaultLatt(),
|
||||||
|
GridDefaultSimd(4, vector_type::Nsimd()),
|
||||||
GridDefaultMpi()));
|
GridDefaultMpi()));
|
||||||
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
|
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
|
||||||
}
|
}
|
||||||
|
|
||||||
GridFourDimModule(GridModuleParameters Params) {
|
GridFourDimModule(const std::vector<int> tweak_simd)
|
||||||
|
{
|
||||||
|
using namespace QCD;
|
||||||
|
if (tweak_simd.size() != 4)
|
||||||
|
{
|
||||||
|
std::cout << GridLogError
|
||||||
|
<< "Error in GridFourDimModule: SIMD size different from 4"
|
||||||
|
<< std::endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Checks that the product agrees with the expectation
|
||||||
|
int simd_sum = 1;
|
||||||
|
for (auto &n : tweak_simd)
|
||||||
|
simd_sum *= n;
|
||||||
|
std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << " Sum: " << simd_sum << std::endl;
|
||||||
|
|
||||||
|
if (simd_sum == vector_type::Nsimd())
|
||||||
|
{
|
||||||
|
set_full(SpaceTimeGrid::makeFourDimGrid(
|
||||||
|
GridDefaultLatt(),
|
||||||
|
tweak_simd,
|
||||||
|
GridDefaultMpi()));
|
||||||
|
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::cout << GridLogError
|
||||||
|
<< "Error in GridFourDimModule: SIMD lanes must sum to "
|
||||||
|
<< vector_type::Nsimd()
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GridFourDimModule(const GridModuleParameters Params)
|
||||||
|
{
|
||||||
using namespace QCD;
|
using namespace QCD;
|
||||||
Params.check();
|
|
||||||
std::vector<int> lattice_v = Params.getLattice();
|
std::vector<int> lattice_v = Params.getLattice();
|
||||||
std::vector<int> mpi_v = Params.getMpi();
|
std::vector<int> mpi_v = Params.getMpi();
|
||||||
if (lattice_v.size() == 4) {
|
if (lattice_v.size() == 4)
|
||||||
|
{
|
||||||
set_full(SpaceTimeGrid::makeFourDimGrid(
|
set_full(SpaceTimeGrid::makeFourDimGrid(
|
||||||
lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
|
lattice_v,
|
||||||
|
GridDefaultSimd(4, vector_type::Nsimd()),
|
||||||
mpi_v));
|
mpi_v));
|
||||||
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
|
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
std::cout << GridLogError
|
std::cout << GridLogError
|
||||||
<< "Error in GridFourDimModule: lattice dimension different from 4"
|
<< "Error in GridFourDimModule: lattice dimension different from 4"
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
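The new GridFourDimModule(tweak_simd) constructor above only accepts a four-entry SIMD decomposition whose entries multiply to vector_type::Nsimd() (note the variable is named simd_sum but accumulates a product). A small stand-alone sketch of that validation, with illustrative lane counts rather than a real vComplex type:

// sketch of the tweak_simd check: 4 entries whose product equals the lane count
#include <cstdio>
#include <vector>

bool valid_simd_layout(const std::vector<int> &tweak_simd, int nsimd) {
  if (tweak_simd.size() != 4) return false;   // four space-time dimensions expected
  int product = 1;
  for (int n : tweak_simd) product *= n;
  return product == nsimd;                    // e.g. {1,2,2,2} is fine when Nsimd()==8
}

int main() {
  std::printf("%d\n", valid_simd_layout({1, 2, 2, 2}, 8));  // prints 1
  std::printf("%d\n", valid_simd_layout({2, 2, 2, 2}, 8));  // prints 0: product is 16
  return 0;
}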
@ -62,36 +62,50 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> {
|
|||||||
fout.close();
|
fout.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
|
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||||
GridParallelRNG &pRNG) {
|
|
||||||
if ((traj % Params.saveInterval) == 0) {
|
if ((traj % Params.saveInterval) == 0) {
|
||||||
std::string config, rng;
|
std::string config, rng;
|
||||||
this->build_filenames(traj, Params, config, rng);
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
|
||||||
BinaryIO::BinarySimpleUnmunger<sobj_double, sobj> munge;
|
uint32_t nersc_csum;
|
||||||
|
uint32_t scidac_csuma;
|
||||||
|
uint32_t scidac_csumb;
|
||||||
|
|
||||||
|
BinarySimpleUnmunger<sobj_double, sobj> munge;
|
||||||
truncate(rng);
|
truncate(rng);
|
||||||
BinaryIO::writeRNGSerial(sRNG, pRNG, rng, 0);
|
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
truncate(config);
|
truncate(config);
|
||||||
uint32_t csum = BinaryIO::writeObjectParallel<vobj, sobj_double>(
|
|
||||||
U, config, munge, 0, Params.format);
|
BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
|
||||||
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Written Binary Configuration " << config
|
std::cout << GridLogMessage << "Written Binary Configuration " << config
|
||||||
<< " checksum " << std::hex << csum << std::dec << std::endl;
|
<< " checksum " << std::hex
|
||||||
|
<< nersc_csum <<"/"
|
||||||
|
<< scidac_csuma <<"/"
|
||||||
|
<< scidac_csumb
|
||||||
|
<< std::dec << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
|
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||||
GridParallelRNG &pRNG) {
|
|
||||||
std::string config, rng;
|
std::string config, rng;
|
||||||
this->build_filenames(traj, Params, config, rng);
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
|
||||||
BinaryIO::BinarySimpleMunger<sobj_double, sobj> munge;
|
BinarySimpleMunger<sobj_double, sobj> munge;
|
||||||
BinaryIO::readRNGSerial(sRNG, pRNG, rng, 0);
|
|
||||||
uint32_t csum = BinaryIO::readObjectParallel<vobj, sobj_double>(
|
uint32_t nersc_csum;
|
||||||
U, config, munge, 0, Params.format);
|
uint32_t scidac_csuma;
|
||||||
|
uint32_t scidac_csumb;
|
||||||
|
BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
BinaryIO::readLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
|
||||||
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Read Binary Configuration " << config
|
std::cout << GridLogMessage << "Read Binary Configuration " << config
|
||||||
<< " checksum " << std::hex << csum << std::dec << std::endl;
|
<< " checksums " << std::hex << nersc_csum<<"/"<<scidac_csuma<<"/"<<scidac_csumb
|
||||||
|
<< std::dec << std::endl;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -54,9 +54,9 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
|||||||
|
|
||||||
// check here that the format is valid
|
// check here that the format is valid
|
||||||
int ieee32big = (Params.format == std::string("IEEE32BIG"));
|
int ieee32big = (Params.format == std::string("IEEE32BIG"));
|
||||||
int ieee32 = (Params.format == std::string("IEEE32"));
|
int ieee32 = (Params.format == std::string("IEEE32"));
|
||||||
int ieee64big = (Params.format == std::string("IEEE64BIG"));
|
int ieee64big = (Params.format == std::string("IEEE64BIG"));
|
||||||
int ieee64 = (Params.format == std::string("IEEE64"));
|
int ieee64 = (Params.format == std::string("IEEE64"));
|
||||||
|
|
||||||
if (!(ieee64big || ieee32 || ieee32big || ieee64)) {
|
if (!(ieee64big || ieee32 || ieee32big || ieee64)) {
|
||||||
std::cout << GridLogError << "Unrecognized file format " << Params.format
|
std::cout << GridLogError << "Unrecognized file format " << Params.format
|
||||||
@ -75,12 +75,19 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
|||||||
std::string config, rng;
|
std::string config, rng;
|
||||||
this->build_filenames(traj, Params, config, rng);
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
|
||||||
ILDGIO IO(config, ILDGwrite);
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
BinaryIO::writeRNGSerial(sRNG, pRNG, rng, 0);
|
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
uint32_t csum = IO.writeConfiguration(U, Params.format);
|
IldgWriter _IldgWriter;
|
||||||
|
_IldgWriter.open(config);
|
||||||
|
_IldgWriter.writeConfiguration(U, traj, config, config);
|
||||||
|
_IldgWriter.close();
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
|
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
|
||||||
<< " checksum " << std::hex << csum << std::dec << std::endl;
|
<< " checksum " << std::hex
|
||||||
|
<< nersc_csum<<"/"
|
||||||
|
<< scidac_csuma<<"/"
|
||||||
|
<< scidac_csumb
|
||||||
|
<< std::dec << std::endl;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -89,12 +96,21 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
|||||||
std::string config, rng;
|
std::string config, rng;
|
||||||
this->build_filenames(traj, Params, config, rng);
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
|
||||||
ILDGIO IO(config, ILDGread);
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
BinaryIO::readRNGSerial(sRNG, pRNG, rng, 0);
|
BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
uint32_t csum = IO.readConfiguration(U); // format from the header
|
|
||||||
|
FieldMetaData header;
|
||||||
|
IldgReader _IldgReader;
|
||||||
|
_IldgReader.open(config);
|
||||||
|
_IldgReader.readConfiguration(U,header); // format from the header
|
||||||
|
_IldgReader.close();
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Read ILDG Configuration from " << config
|
std::cout << GridLogMessage << "Read ILDG Configuration from " << config
|
||||||
<< " checksum " << std::hex << csum << std::dec << std::endl;
|
<< " checksum " << std::hex
|
||||||
|
<< nersc_csum<<"/"
|
||||||
|
<< scidac_csuma<<"/"
|
||||||
|
<< scidac_csumb
|
||||||
|
<< std::dec << std::endl;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -70,7 +70,7 @@ class NerscHmcCheckpointer : public BaseHmcCheckpointer<Gimpl> {
|
|||||||
std::string config, rng;
|
std::string config, rng;
|
||||||
this->build_filenames(traj, Params, config, rng);
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
|
||||||
NerscField header;
|
FieldMetaData header;
|
||||||
NerscIO::readRNGState(sRNG, pRNG, header, rng);
|
NerscIO::readRNGState(sRNG, pRNG, header, rng);
|
||||||
NerscIO::readConfiguration(U, header, config);
|
NerscIO::readConfiguration(U, header, config);
|
||||||
};
|
};
|
||||||
|
@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
|
|||||||
typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
|
typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
|
||||||
using ObsBase::ObsBase; // for constructors
|
using ObsBase::ObsBase; // for constructors
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// acquire resource
|
// acquire resource
|
||||||
virtual void initialize(){
|
virtual void initialize(){
|
||||||
this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
|
this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
|
||||||
@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
|
|||||||
PlaquetteMod(): ObsBase(NoParameters()){}
|
PlaquetteMod(): ObsBase(NoParameters()){}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template < class Impl >
|
template < class Impl >
|
||||||
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
|
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
|
||||||
typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
|
typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
|
||||||
using ObsBase::ObsBase; // for constructors
|
using ObsBase::ObsBase; // for constructors
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// acquire resource
|
// acquire resource
|
||||||
virtual void initialize(){
|
virtual void initialize(){
|
||||||
this->ObservablePtr.reset(new TopologicalCharge<Impl>());
|
this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
TopologicalChargeMod(): ObsBase(NoParameters()){}
|
TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
|
||||||
|
TopologicalChargeMod(): ObsBase(){}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}// QCD temporarily here
|
}// QCD temporarily here
|
||||||
|
|
||||||
|
|
||||||
|
@ -33,9 +33,45 @@ directory
 namespace Grid {
 namespace QCD {

+struct TopologySmearingParameters : Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
+    int, steps,
+    float, step_size,
+    int, meas_interval,
+    float, maxTau);
+
+  TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
+    steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
+
+  template < class ReaderClass >
+  TopologySmearingParameters(Reader<ReaderClass>& Reader){
+    read(Reader, "Smearing", *this);
+  }
+};
+
+
+struct TopologyObsParameters : Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
+    int, interval,
+    bool, do_smearing,
+    TopologySmearingParameters, Smearing);
+
+  TopologyObsParameters(int interval = 1, bool smearing = false):
+    interval(interval), Smearing(smearing){}
+
+  template <class ReaderClass >
+  TopologyObsParameters(Reader<ReaderClass>& Reader){
+    read(Reader, "TopologyMeasurement", *this);
+  }
+};
+

 // this is only defined for a gauge theory
 template <class Impl>
 class TopologicalCharge : public HmcObservable<typename Impl::Field> {
+  TopologyObsParameters Pars;

  public:
   // here forces the Impl to be of gauge fields
   // if not the compiler will complain
@ -44,20 +80,39 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
   // necessary for HmcObservable compatibility
   typedef typename Impl::Field Field;

+  TopologicalCharge(int interval = 1, bool do_smearing = false):
+    Pars(interval, do_smearing){}
+
+  TopologicalCharge(TopologyObsParameters P):Pars(P){
+    std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
+  }
+
   void TrajectoryComplete(int traj,
                           Field &U,
                           GridSerialRNG &sRNG,
                           GridParallelRNG &pRNG) {

-    Real q = WilsonLoops<Impl>::TopologicalCharge(U);
-
-    int def_prec = std::cout.precision();
+    if (traj%Pars.interval == 0){
+      // Smearing
+      Field Usmear = U;
+      int def_prec = std::cout.precision();
+
+      if (Pars.do_smearing){
+        // using wilson flow by default here
+        WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
+        WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
+        Real T0 = WF.energyDensityPlaquette(Usmear);
+        std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+                  << "T0 : [ " << traj << " ] "<< T0 << std::endl;
+      }

-    std::cout << GridLogMessage
-              << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
-              << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
+      Real q = WilsonLoops<Impl>::TopologicalCharge(Usmear);
+      std::cout << GridLogMessage
+                << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+                << "Topological Charge: [ " << traj << " ] "<< q << std::endl;

       std::cout.precision(def_prec);
+    }
   }

 };
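TopologicalCharge now measures only every Pars.interval trajectories and can smear a copy of the gauge field first. The control flow is easy to get wrong (measure on the smeared copy, never modify U), so here is a stand-alone sketch of it with stand-in types; Field, smear and topological_charge below are placeholders, not Grid code.

// sketch: measure every Nth trajectory, optionally smear a copy of the field first
#include <cstdio>
#include <vector>

using Field = std::vector<double>;

Field  smear(const Field &U)               { Field s(U); for (auto &x : s) x *= 0.9; return s; }
double topological_charge(const Field &U)  { double q = 0; for (auto x : U) q += x; return q; }

int main() {
  const int  interval    = 5;
  const bool do_smearing = true;
  Field U(10, 1.0);

  for (int traj = 0; traj < 20; ++traj) {
    if (traj % interval != 0) continue;      // only measure every `interval` trajectories
    Field Usmear = U;                        // measure on a copy; U itself is untouched
    if (do_smearing) Usmear = smear(U);      // stands in for WilsonFlow::smear_adaptive
    std::printf("Topological Charge: [ %d ] %f\n", traj, topological_charge(Usmear));
  }
  return 0;
}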
|
@ -62,7 +62,10 @@ class Representations {
|
|||||||
|
|
||||||
typedef Representations<FundamentalRepresentation> NoHirep;
|
typedef Representations<FundamentalRepresentation> NoHirep;
|
||||||
typedef Representations<EmptyRep<typename ScalarImplR::Field> > ScalarFields;
|
typedef Representations<EmptyRep<typename ScalarImplR::Field> > ScalarFields;
|
||||||
//typedef Representations<EmptyRep<typename ScalarMatrixImplR::Field> > ScalarMatrixFields;
|
typedef Representations<EmptyRep<typename ScalarAdjImplR::Field> > ScalarMatrixFields;
|
||||||
|
|
||||||
|
template < int Colours>
|
||||||
|
using ScalarNxNMatrixFields = Representations<EmptyRep<typename ScalarNxNAdjImplR<Colours>::Field> >;
|
||||||
|
|
||||||
// Helper classes to access the elements
|
// Helper classes to access the elements
|
||||||
// Strips the first N parameters from the tuple
|
// Strips the first N parameters from the tuple
|
||||||
|
@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
|
|||||||
if (maxTau - taus < epsilon){
|
if (maxTau - taus < epsilon){
|
||||||
epsilon = maxTau-taus;
|
epsilon = maxTau-taus;
|
||||||
}
|
}
|
||||||
std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
|
//std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
|
||||||
GaugeField Z(U._grid);
|
GaugeField Z(U._grid);
|
||||||
GaugeField Zprime(U._grid);
|
GaugeField Zprime(U._grid);
|
||||||
GaugeField tmp(U._grid), Uprime(U._grid);
|
GaugeField tmp(U._grid), Uprime(U._grid);
|
||||||
@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
|
|||||||
// adjust integration step
|
// adjust integration step
|
||||||
|
|
||||||
taus += epsilon;
|
taus += epsilon;
|
||||||
std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
|
//std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
|
||||||
|
|
||||||
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
|
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
|
||||||
std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
|
//std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
|
|||||||
out = in;
|
out = in;
|
||||||
for (unsigned int step = 1; step <= Nstep; step++) {
|
for (unsigned int step = 1; step <= Nstep; step++) {
|
||||||
auto start = std::chrono::high_resolution_clock::now();
|
auto start = std::chrono::high_resolution_clock::now();
|
||||||
std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
|
|
||||||
evolve_step(out);
|
evolve_step(out);
|
||||||
auto end = std::chrono::high_resolution_clock::now();
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
std::chrono::duration<double> diff = end - start;
|
std::chrono::duration<double> diff = end - start;
|
||||||
@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
|
|||||||
unsigned int step = 0;
|
unsigned int step = 0;
|
||||||
do{
|
do{
|
||||||
step++;
|
step++;
|
||||||
std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
|
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
|
||||||
evolve_step_adaptive(out, maxTau);
|
evolve_step_adaptive(out, maxTau);
|
||||||
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
|
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
|
||||||
<< step << " "
|
<< step << " "
|
||||||
|
193
lib/qcd/utils/GaugeFix.h
Normal file
193
lib/qcd/utils/GaugeFix.h
Normal file
@ -0,0 +1,193 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
grid` physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
//#include <Grid/Grid.h>
#ifndef GRID_QCD_GAUGE_FIX_H
#define GRID_QCD_GAUGE_FIX_H
namespace Grid {
namespace QCD {

template <class Gimpl>
class FourierAcceleratedGaugeFixer : public Gimpl {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);

  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField     GaugeLorentz;

  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
    for(int mu=0;mu<Nd;mu++){
      Complex cmi(0.0,-1.0);
      A[mu] = Ta(U[mu]) * cmi;
    }
  }
  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu) {
    dmuAmu=zero;
    for(int mu=0;mu<Nd;mu++){
      dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
    }
  }
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false) {
    GridBase *grid = Umu._grid;

    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu);
    Real old_trace = org_link_trace;
    Real trG;

    std::vector<GaugeMat> U(Nd,grid);
    GaugeMat dmuAmu(grid);

    for(int i=0;i<maxiter;i++){
      for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu);
      if ( Fourier==false ) {
        trG = SteepestDescentStep(U,alpha,dmuAmu);
      } else {
        trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu);
      }
      for(int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(Umu,U[mu],mu);
      // Monitor progress and convergence test
      // infrequently to minimise cost overhead
      if ( i %20 == 0 ) {
        Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
        Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu);

        if (Fourier)
          std::cout << GridLogMessage << "Fourier Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
        else
          std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;

        Real Phi   = 1.0 - old_trace / link_trace ;
        Real Omega = 1.0 - trG;

        std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
        if ( (Omega < Omega_tol) && ( ::fabs(Phi) < Phi_tol) ) {
          std::cout << GridLogMessage << "Converged ! "<<std::endl;
          return;
        }

        old_trace = link_trace;
      }
    }
  };

  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
    GridBase *grid = U[0]._grid;

    std::vector<GaugeMat> A(Nd,grid);
    GaugeMat g(grid);

    GaugeLinkToLieAlgebraField(U,A);
    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);

    Real vol = grid->gSites();
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;

    SU<Nc>::GaugeTransform(U,g);

    return trG;
  }

  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {

    GridBase *grid = U[0]._grid;

    Real vol = grid->gSites();

    FFT theFFT((GridCartesian *)grid);

    LatticeComplex Fp(grid);
    LatticeComplex psq(grid); psq=zero;
    LatticeComplex pmu(grid);
    LatticeComplex one(grid); one = Complex(1.0,0.0);

    GaugeMat g(grid);
    GaugeMat dmuAmu_p(grid);
    std::vector<GaugeMat> A(Nd,grid);

    GaugeLinkToLieAlgebraField(U,A);

    DmuAmu(A,dmuAmu);

    theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward);

    //////////////////////////////////
    // Work out Fp = psq_max/ psq...
    //////////////////////////////////
    std::vector<int> latt_size = grid->GlobalDimensions();
    std::vector<int> coor(grid->_ndimension,0);
    for(int mu=0;mu<Nd;mu++) {

      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
      LatticeCoordinate(pmu,mu);
      pmu = TwoPiL * pmu ;
      psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5);
    }

    Complex psqMax(16.0);
    Fp =  psqMax*one/psq;

    /*
    static int once;
    if ( once == 0 ) {
      std::cout << " Fp " << Fp <<std::endl;
      once ++;
    }*/

    pokeSite(TComplex(1.0),Fp,coor);

    dmuAmu_p  = dmuAmu_p * Fp;

    theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);

    GaugeMat ciadmam(grid);
    Complex cialpha(0.0,-alpha);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);

    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;

    SU<Nc>::GaugeTransform(U,g);

    return trG;
  }

  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
    GridBase *grid = g._grid;
    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
    DmuAmu(A,dmuAmu);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);
  }
};

}
}
#endif
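Illustrative sketch only (not part of the commit above): how the Fourier-accelerated gauge fixer might be driven from a small test program. The grid setup, RNG seeding, gauge implementation (PeriodicGimplR) and the tolerances are assumptions drawn from the wider Grid API, not from this diff.

// Minimal usage sketch, assuming the standard Grid initialisation helpers.
#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  // Hypothetical 4d grid built with the default lattice/SIMD/MPI layouts.
  GridCartesian *grid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());

  GridParallelRNG RNG(grid);
  RNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));

  LatticeGaugeField Umu(grid);
  SU3::HotConfiguration(RNG, Umu);   // random starting configuration (assumption)

  Real alpha = 0.1;                  // steepest-descent step size (illustrative)
  // Landau gauge fix; final argument enables the Fourier-accelerated step.
  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(
      Umu, alpha, 10000, 1.0e-12, 1.0e-12, true);

  Grid_finalize();
}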
@@ -716,8 +716,7 @@ template<typename GaugeField,typename GaugeMat>
   for (int a = 0; a < AdjointDimension; a++) {
     generator(a, Ta);
-    auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep
-    pokeColour(h_out, tmp, a);
+    pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a);
   }
 }

@@ -12,7 +12,4 @@
 #include <Grid/qcd/utils/SUnAdjoint.h>
 #include <Grid/qcd/utils/SUnTwoIndex.h>
-
-
-
 #endif
@@ -73,7 +73,7 @@ public:
   //////////////////////////////////////////////////
   // trace of directed plaquette oriented in mu,nu plane
   //////////////////////////////////////////////////
-  static void traceDirPlaquette(LatticeComplex &plaq,
+  static void traceDirPlaquette(ComplexField &plaq,
                                 const std::vector<GaugeMat> &U, const int mu,
                                 const int nu) {
     GaugeMat sp(U[0]._grid);

@@ -83,9 +83,9 @@ public:
   //////////////////////////////////////////////////
   // sum over all planes of plaquette
   //////////////////////////////////////////////////
-  static void sitePlaquette(LatticeComplex &Plaq,
+  static void sitePlaquette(ComplexField &Plaq,
                             const std::vector<GaugeMat> &U) {
-    LatticeComplex sitePlaq(U[0]._grid);
+    ComplexField sitePlaq(U[0]._grid);
     Plaq = zero;
     for (int mu = 1; mu < Nd; mu++) {
       for (int nu = 0; nu < mu; nu++) {

@@ -104,11 +104,11 @@ public:
       U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
     }

-    LatticeComplex Plaq(Umu._grid);
+    ComplexField Plaq(Umu._grid);

     sitePlaquette(Plaq, U);
-    TComplex Tp = sum(Plaq);
-    Complex p = TensorRemove(Tp);
+    auto Tp = sum(Plaq);
+    auto p  = TensorRemove(Tp);
     return p.real();
   }

@@ -129,15 +129,15 @@ public:
   static RealD linkTrace(const GaugeLorentz &Umu) {
     std::vector<GaugeMat> U(Nd, Umu._grid);

-    LatticeComplex Tr(Umu._grid);
+    ComplexField Tr(Umu._grid);
     Tr = zero;
     for (int mu = 0; mu < Nd; mu++) {
       U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
       Tr = Tr + trace(U[mu]);
     }

-    TComplex Tp = sum(Tr);
-    Complex p = TensorRemove(Tp);
+    auto Tp = sum(Tr);
+    auto p  = TensorRemove(Tp);

     double vol = Umu._grid->gSites();

@@ -355,8 +355,8 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
   double coeff = 8.0/(32.0*M_PI*M_PI);

-  LatticeComplex qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez);
-  TComplex Tq = sum(qfield);
+  ComplexField qfield = coeff*trace(Bx*Ex + By*Ey + Bz*Ez);
+  auto Tq = sum(qfield);
   return TensorRemove(Tq).real();
 }

@@ -375,16 +375,16 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
         adj(Gimpl::CovShiftForward(
             U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu])));
   }
-  static void traceDirRectangle(LatticeComplex &rect,
+  static void traceDirRectangle(ComplexField &rect,
                                 const std::vector<GaugeMat> &U, const int mu,
                                 const int nu) {
     GaugeMat sp(U[0]._grid);
     dirRectangle(sp, U, mu, nu);
     rect = trace(sp);
   }
-  static void siteRectangle(LatticeComplex &Rect,
+  static void siteRectangle(ComplexField &Rect,
                             const std::vector<GaugeMat> &U) {
-    LatticeComplex siteRect(U[0]._grid);
+    ComplexField siteRect(U[0]._grid);
     Rect = zero;
     for (int mu = 1; mu < Nd; mu++) {
       for (int nu = 0; nu < mu; nu++) {

@@ -404,12 +404,12 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
     U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
   }

-  LatticeComplex Rect(Umu._grid);
+  ComplexField Rect(Umu._grid);

   siteRectangle(Rect, U);

-  TComplex Tp = sum(Rect);
-  Complex p = TensorRemove(Tp);
+  auto Tp = sum(Rect);
+  auto p  = TensorRemove(Tp);
   return p.real();
 }
 //////////////////////////////////////////////////
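Illustrative sketch only (not part of the diff above): how the WilsonLoops helpers touched here are typically consumed. The gauge implementation, the avgRectangle call and the variable names are assumptions; only avgPlaquette and linkTrace appear explicitly in the hunks above.

// Hedged usage sketch: measure plaquette, rectangle and link-trace averages.
#include <Grid/Grid.h>

static void measureLoops(const Grid::QCD::LatticeGaugeField &Umu) {
  using namespace Grid;
  using namespace Grid::QCD;
  RealD plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  RealD rect = WilsonLoops<PeriodicGimplR>::avgRectangle(Umu);   // assumed helper
  RealD ltr  = WilsonLoops<PeriodicGimplR>::linkTrace(Umu);
  std::cout << GridLogMessage << "plaq " << plaq
            << " rect " << rect << " linkTrace " << ltr << std::endl;
}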
@@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName)
                Hdf5Type<unsigned int>::type());
 }

-void Hdf5Reader::push(const std::string &s)
+bool Hdf5Reader::push(const std::string &s)
 {
   group_ = group_.openGroup(s);
   path_.push_back(s);
+
+  return true;
 }

 void Hdf5Reader::pop(void)

@@ -54,7 +54,7 @@ namespace Grid
 public:
   Hdf5Reader(const std::string &fileName);
   virtual ~Hdf5Reader(void) = default;
-  void push(const std::string &s);
+  bool push(const std::string &s);
   void pop(void);
   template <typename U>
   void readDefault(const std::string &s, U &output);

@@ -110,11 +110,12 @@ THE SOFTWARE.

 #define GRID_MACRO_MEMBER(A,B)        A B;
 #define GRID_MACRO_COMP_MEMBER(A,B)   result = (result and (lhs. B == rhs. B));
-#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " <<std::endl;
+#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" " #B << " = " << obj. B << " ; " <<std::endl;
 #define GRID_MACRO_READ_MEMBER(A,B)   Grid::read(RD,#B,obj. B);
 #define GRID_MACRO_WRITE_MEMBER(A,B)  Grid::write(WR,#B,obj. B);

 #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)\
+  std::string SerialisableClassName(void) {return std::string(#cname);} \
   GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\
   template <typename T>\
   static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
@@ -32,16 +32,21 @@ using namespace Grid;
 using namespace std;

 // Writer implementation ///////////////////////////////////////////////////////
-XmlWriter::XmlWriter(const string &fileName)
-: fileName_(fileName)
+XmlWriter::XmlWriter(const string &fileName, string toplev) : fileName_(fileName)
 {
-  node_ = doc_.append_child();
-  node_.set_name("grid");
+  if ( toplev == std::string("") ) {
+    node_=doc_;
+  } else {
+    node_=doc_.append_child();
+    node_.set_name(toplev.c_str());
+  }
 }

 XmlWriter::~XmlWriter(void)
 {
-  doc_.save_file(fileName_.c_str(), "  ");
+  if ( fileName_ != std::string("") ) {
+    doc_.save_file(fileName_.c_str(), "  ");
+  }
 }

 void XmlWriter::push(const string &s)

@@ -53,21 +58,44 @@ void XmlWriter::pop(void)
 {
   node_ = node_.parent();
 }
+std::string XmlWriter::XmlString(void)
+{
+  std::ostringstream oss;
+  doc_.save(oss);
+  return oss.str();
+}

-// Reader implementation ///////////////////////////////////////////////////////
-XmlReader::XmlReader(const string &fileName)
-: fileName_(fileName)
+XmlReader::XmlReader(const char *xmlstring,string toplev) : fileName_("")
 {
-  pugi::xml_parse_result result = doc_.load_file(fileName_.c_str());
-
-  if ( !result )
-  {
+  pugi::xml_parse_result result;
+  result = doc_.load_string(xmlstring);
+  if ( !result ) {
     cerr << "XML error description: " << result.description() << "\n";
     cerr << "XML error offset     : " << result.offset << "\n";
     abort();
   }
+  if ( toplev == std::string("") ) {
+    node_ = doc_;
+  } else {
+    node_ = doc_.child(toplev.c_str());
+  }
+}

-  node_ = doc_.child("grid");
+// Reader implementation ///////////////////////////////////////////////////////
+XmlReader::XmlReader(const string &fileName,string toplev) : fileName_(fileName)
+{
+  pugi::xml_parse_result result;
+  result = doc_.load_file(fileName_.c_str());
+  if ( !result ) {
+    cerr << "XML error description: " << result.description() << "\n";
+    cerr << "XML error offset     : " << result.offset << "\n";
+    abort();
+  }
+  if ( toplev == std::string("") ) {
+    node_ = doc_;
+  } else {
+    node_ = doc_.child(toplev.c_str());
+  }
 }

 bool XmlReader::push(const string &s)

@@ -45,9 +45,8 @@ namespace Grid
   class XmlWriter: public Writer<XmlWriter>
   {
-
   public:
-    XmlWriter(const std::string &fileName);
+    XmlWriter(const std::string &fileName,std::string toplev = std::string("grid") );
     virtual ~XmlWriter(void);
     void push(const std::string &s);
     void pop(void);

@@ -55,6 +54,7 @@ namespace Grid
     void writeDefault(const std::string &s, const U &x);
     template <typename U>
     void writeDefault(const std::string &s, const std::vector<U> &x);
+    std::string XmlString(void);
   private:
     pugi::xml_document doc_;
     pugi::xml_node     node_;

@@ -64,7 +64,8 @@ namespace Grid
   class XmlReader: public Reader<XmlReader>
   {
   public:
-    XmlReader(const std::string &fileName);
+    XmlReader(const char *xmlstring,std::string toplev = std::string("grid") );
+    XmlReader(const std::string &fileName,std::string toplev = std::string("grid") );
     virtual ~XmlReader(void) = default;
     bool push(const std::string &s);
     void pop(void);

@@ -118,7 +119,7 @@ namespace Grid
     std::string buf;

     readDefault(s, buf);
-    std::cout << s << " " << buf << std::endl;
+    //  std::cout << s << " " << buf << std::endl;
     fromString(output, buf);
   }
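Illustrative sketch only (not part of the commit): round-tripping a serialisable object through an in-memory XML string using the new XmlWriter(fileName, toplev) constructor, XmlString() accessor and XmlReader(const char*, toplev) constructor introduced above. The serialisable struct and its fields are hypothetical; only the reader/writer calls follow the diff.

// Hedged usage sketch of the string-based XML serialisation path.
#include <Grid/Grid.h>

using namespace Grid;

class MyParams : Serializable {
public:
  // Hypothetical payload; any GRID_SERIALIZABLE class would do.
  GRID_SERIALIZABLE_CLASS_MEMBERS(MyParams,
                                  int,    traj,
                                  double, beta);
};

void roundTrip(void) {
  MyParams in, out;
  in.traj = 10; in.beta = 5.7;

  XmlWriter WR("", "grid");          // empty file name: keep the XML in memory only
  write(WR, "params", in);
  std::string xml = WR.XmlString();  // accessor added in this commit

  XmlReader RD(xml.c_str(), "grid"); // parse straight back from the string
  read(RD, "params", out);
}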
@@ -701,9 +701,28 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i ret;
+#if defined (AVX2)
+    // AVX2 horizontal adds within upper and lower halves of register; use
+    // SSE to add upper and lower halves for result.
+    __m256i v1, v2;
+    __m128i u1, u2;
+    v1  = _mm256_hadd_epi32(in, in);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);      // upper half
+    u2  = _mm256_extracti128_si256(v2, 1); // lower half
+    ret = _mm_add_epi32(u1, u2);
+#else
+    // No AVX horizontal add; extract upper and lower halves of register & use
+    // SSE intrinsics.
+    __m128i u1, u2, u3;
+    u1  = _mm256_extractf128_si256(in, 0); // upper half
+    u2  = _mm256_extractf128_si256(in, 1); // lower half
+    u3  = _mm_add_epi32(u1, u2);
+    u1  = _mm_hadd_epi32(u3, u3);
+    ret = _mm_hadd_epi32(u1, u1);
+#endif
+    return _mm_cvtsi128_si32(ret);
   }

 }

@@ -543,6 +543,24 @@ namespace Optimization {
     u512d conv; conv.v = v1;
     return conv.f[0];
   }

+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
+    // No full vector reduce, use AVX to add upper and lower halves of register
+    // and perform AVX reduction.
+    __m256i v1, v2, v3;
+    __m128i u1, u2, ret;
+    v1  = _mm512_castsi512_si256(in);        // upper half
+    v2  = _mm512_extracti32x8_epi32(in, 1);  // lower half
+    v3  = _mm256_add_epi32(v1, v2);
+    v1  = _mm256_hadd_epi32(v3, v3);
+    v2  = _mm256_hadd_epi32(v1, v1);
+    u1  = _mm256_castsi256_si128(v2);        // upper half (semicolon added; missing in the raw diff)
+    u2  = _mm256_extracti128_si256(v2, 1);   // lower half
+    ret = _mm_add_epi32(u1, u2);
+    return _mm_cvtsi128_si32(ret);
+  }
 #else
   //Complex float Reduce
   template<>

@@ -570,9 +588,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in);
   }
 #endif

@@ -401,9 +401,7 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return _mm512_reduce_add_epi32(in);
   }
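Scalar reference (illustrative only, not part of the commit) for what the intrinsic implementations above compute: summing the 32-bit integer lanes of a SIMD register is equivalent to the following loop over an array of the same width (8 lanes for AVX/AVX2, 16 for AVX-512).

// Portable scalar equivalent of the horizontal integer reductions above.
#include <cstdint>
#include <cstddef>

static inline int32_t reduce_add_i32(const int32_t *lanes, std::size_t nlanes) {
  int32_t sum = 0;
  for (std::size_t i = 0; i < nlanes; ++i) sum += lanes[i]; // horizontal add
  return sum;
}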
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
@ -6,8 +6,9 @@
|
|||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Nils Meyer <nils.meyer@ur.de>
|
||||||
Author: neo <cossu@post.kek.jp>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: neo <cossu@post.kek.jp>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
//----------------------------------------------------------------------
|
|
||||||
/*! @file Grid_sse4.h
|
|
||||||
@brief Optimization libraries for NEON (ARM) instructions set ARMv8
|
|
||||||
|
|
||||||
Experimental - Using intrinsics - DEVELOPING!
|
/*
|
||||||
|
|
||||||
|
ARMv8 NEON intrinsics layer by
|
||||||
|
|
||||||
|
Nils Meyer <nils.meyer@ur.de>,
|
||||||
|
University of Regensburg, Germany
|
||||||
|
SFB/TRR55
|
||||||
|
|
||||||
*/
|
*/
|
||||||
// Time-stamp: <2015-07-10 17:45:09 neo>
|
|
||||||
//----------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
#ifndef GEN_SIMD_WIDTH
|
||||||
|
#define GEN_SIMD_WIDTH 16u
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "Grid_generic_types.h"
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
// ARMv8 supports double precision
|
namespace Grid {
|
||||||
|
|
||||||
namespace Optimization {
|
namespace Optimization {
|
||||||
|
|
||||||
template<class vtype>
|
template<class vtype>
|
||||||
@ -46,14 +53,18 @@ namespace Optimization {
|
|||||||
float32x4_t f;
|
float32x4_t f;
|
||||||
vtype v;
|
vtype v;
|
||||||
};
|
};
|
||||||
|
|
||||||
union u128f {
|
union u128f {
|
||||||
float32x4_t v;
|
float32x4_t v;
|
||||||
float f[4];
|
float f[4];
|
||||||
};
|
};
|
||||||
union u128d {
|
union u128d {
|
||||||
float64x2_t v;
|
float64x2_t v;
|
||||||
double f[4];
|
double f[2];
|
||||||
|
};
|
||||||
|
// half precision
|
||||||
|
union u128h {
|
||||||
|
float16x8_t v;
|
||||||
|
uint16_t f[8];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Vsplat{
|
struct Vsplat{
|
||||||
@ -64,20 +75,20 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
// Real float
|
// Real float
|
||||||
inline float32x4_t operator()(float a){
|
inline float32x4_t operator()(float a){
|
||||||
return vld1q_dup_f32(&a);
|
return vdupq_n_f32(a);
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
inline float32x4_t operator()(double a, double b){
|
inline float64x2_t operator()(double a, double b){
|
||||||
float tmp[4]={(float)a,(float)b,(float)a,(float)b};
|
double tmp[2]={a,b};
|
||||||
return vld1q_f32(tmp);
|
return vld1q_f64(tmp);
|
||||||
}
|
}
|
||||||
//Real double
|
//Real double // N:tbc
|
||||||
inline float32x4_t operator()(double a){
|
inline float64x2_t operator()(double a){
|
||||||
return vld1q_dup_f32(&a);
|
return vdupq_n_f64(a);
|
||||||
}
|
}
|
||||||
//Integer
|
//Integer // N:tbc
|
||||||
inline uint32x4_t operator()(Integer a){
|
inline uint32x4_t operator()(Integer a){
|
||||||
return vld1q_dup_u32(&a);
|
return vdupq_n_u32(a);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -87,8 +98,8 @@ namespace Optimization {
|
|||||||
vst1q_f32(F, a);
|
vst1q_f32(F, a);
|
||||||
}
|
}
|
||||||
//Double
|
//Double
|
||||||
inline void operator()(float32x4_t a, double* D){
|
inline void operator()(float64x2_t a, double* D){
|
||||||
vst1q_f32((float*)D, a);
|
vst1q_f64(D, a);
|
||||||
}
|
}
|
||||||
//Integer
|
//Integer
|
||||||
inline void operator()(uint32x4_t a, Integer* I){
|
inline void operator()(uint32x4_t a, Integer* I){
|
||||||
@ -97,54 +108,54 @@ namespace Optimization {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Vstream{
|
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
|
||||||
//Float
|
//Float // N:generic
|
||||||
inline void operator()(float * a, float32x4_t b){
|
inline void operator()(float * a, float32x4_t b){
|
||||||
|
memcpy(a,&b,4*sizeof(float));
|
||||||
}
|
}
|
||||||
//Double
|
//Double // N:generic
|
||||||
inline void operator()(double * a, float32x4_t b){
|
inline void operator()(double * a, float64x2_t b){
|
||||||
|
memcpy(a,&b,2*sizeof(double));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Nils: Vset untested; not used currently in Grid at all;
|
||||||
|
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
|
||||||
struct Vset{
|
struct Vset{
|
||||||
// Complex float
|
// Complex float // N:ok
|
||||||
inline float32x4_t operator()(Grid::ComplexF *a){
|
inline float32x4_t operator()(Grid::ComplexF *a){
|
||||||
float32x4_t foo;
|
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
|
||||||
return foo;
|
return vld1q_f32(tmp);
|
||||||
}
|
}
|
||||||
// Complex double
|
// Complex double // N:ok
|
||||||
inline float32x4_t operator()(Grid::ComplexD *a){
|
inline float64x2_t operator()(Grid::ComplexD *a){
|
||||||
float32x4_t foo;
|
double tmp[2]={a[0].imag(),a[0].real()};
|
||||||
return foo;
|
return vld1q_f64(tmp);
|
||||||
}
|
}
|
||||||
// Real float
|
// Real float // N:ok
|
||||||
inline float32x4_t operator()(float *a){
|
inline float32x4_t operator()(float *a){
|
||||||
float32x4_t foo;
|
float tmp[4]={a[3],a[2],a[1],a[0]};
|
||||||
return foo;
|
return vld1q_f32(tmp);
|
||||||
}
|
}
|
||||||
// Real double
|
// Real double // N:ok
|
||||||
inline float32x4_t operator()(double *a){
|
inline float64x2_t operator()(double *a){
|
||||||
float32x4_t foo;
|
double tmp[2]={a[1],a[0]};
|
||||||
return foo;
|
return vld1q_f64(tmp);
|
||||||
}
|
}
|
||||||
// Integer
|
// Integer // N:ok
|
||||||
inline uint32x4_t operator()(Integer *a){
|
inline uint32x4_t operator()(Integer *a){
|
||||||
uint32x4_t foo;
|
return vld1q_dup_u32(a);
|
||||||
return foo;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// N:leaving as is
|
||||||
template <typename Out_type, typename In_type>
|
template <typename Out_type, typename In_type>
|
||||||
struct Reduce{
|
struct Reduce{
|
||||||
//Need templated class to overload output type
|
//Need templated class to overload output type
|
||||||
//General form must generate error if compiled
|
//General form must generate error if compiled
|
||||||
inline Out_type operator()(In_type in){
|
inline Out_type operator()(In_type in){
|
||||||
printf("Error, using wrong Reduce function\n");
|
printf("Error, using wrong Reduce function\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
return 0;
|
return 0;
|
||||||
@ -184,26 +195,98 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct MultRealPart{
|
||||||
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
|
float32x4_t re = vtrn1q_f32(a, a);
|
||||||
|
return vmulq_f32(re, b);
|
||||||
|
}
|
||||||
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||||
|
float64x2_t re = vzip1q_f64(a, a);
|
||||||
|
return vmulq_f64(re, b);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MaddRealPart{
|
||||||
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
|
||||||
|
float32x4_t re = vtrn1q_f32(a, a);
|
||||||
|
return vfmaq_f32(c, re, b);
|
||||||
|
}
|
||||||
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
|
||||||
|
float64x2_t re = vzip1q_f64(a, a);
|
||||||
|
return vfmaq_f64(c, re, b);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Div{
|
||||||
|
// Real float
|
||||||
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
|
return vdivq_f32(a, b);
|
||||||
|
}
|
||||||
|
// Real double
|
||||||
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||||
|
return vdivq_f64(a, b);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
struct MultComplex{
|
struct MultComplex{
|
||||||
// Complex float
|
// Complex float
|
||||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
float32x4_t foo;
|
|
||||||
return foo;
|
float32x4_t r0, r1, r2, r3, r4;
|
||||||
|
|
||||||
|
// a = ar ai Ar Ai
|
||||||
|
// b = br bi Br Bi
|
||||||
|
// collect real/imag part, negate bi and Bi
|
||||||
|
r0 = vtrn1q_f32(b, b); // br br Br Br
|
||||||
|
r1 = vnegq_f32(b); // -br -bi -Br -Bi
|
||||||
|
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
|
||||||
|
|
||||||
|
// the fun part
|
||||||
|
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
|
||||||
|
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
|
||||||
|
|
||||||
|
// fma(a,b,c) = a+b*c
|
||||||
|
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
|
||||||
|
|
||||||
|
// no fma, use mul and add
|
||||||
|
//float32x4_t r5;
|
||||||
|
//r5 = vmulq_f32(r0, a);
|
||||||
|
//return vaddq_f32(r4, r5);
|
||||||
}
|
}
|
||||||
// Complex double
|
// Complex double
|
||||||
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
|
||||||
float32x4_t foo;
|
|
||||||
return foo;
|
float64x2_t r0, r1, r2, r3, r4;
|
||||||
|
|
||||||
|
// b = br bi
|
||||||
|
// collect real/imag part, negate bi
|
||||||
|
r0 = vtrn1q_f64(b, b); // br br
|
||||||
|
r1 = vnegq_f64(b); // -br -bi
|
||||||
|
r2 = vtrn2q_f64(b, r1); // bi -bi
|
||||||
|
|
||||||
|
// the fun part
|
||||||
|
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
|
||||||
|
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
|
||||||
|
|
||||||
|
// fma(a,b,c) = a+b*c
|
||||||
|
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
|
||||||
|
|
||||||
|
// no fma, use mul and add
|
||||||
|
//float64x2_t r5;
|
||||||
|
//r5 = vmulq_f64(r0, a);
|
||||||
|
//return vaddq_f64(r4, r5);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Mult{
|
struct Mult{
|
||||||
// Real float
|
// Real float
|
||||||
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
|
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
|
||||||
return vaddq_f32(vmulq_f32(b,c),a);
|
//return vaddq_f32(vmulq_f32(b,c),a);
|
||||||
|
return vfmaq_f32(a, b, c);
|
||||||
}
|
}
|
||||||
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
|
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
|
||||||
return vaddq_f64(vmulq_f64(b,c),a);
|
//return vaddq_f64(vmulq_f64(b,c),a);
|
||||||
|
return vfmaq_f64(a, b, c);
|
||||||
}
|
}
|
||||||
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
|
||||||
return vmulq_f32(a,b);
|
return vmulq_f32(a,b);
|
||||||
@ -221,89 +304,275 @@ namespace Optimization {
|
|||||||
struct Conj{
|
struct Conj{
|
||||||
// Complex single
|
// Complex single
|
||||||
inline float32x4_t operator()(float32x4_t in){
|
inline float32x4_t operator()(float32x4_t in){
|
||||||
return in;
|
// ar ai br bi -> ar -ai br -bi
|
||||||
|
float32x4_t r0, r1;
|
||||||
|
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||||
|
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
|
||||||
|
return vtrn1q_f32(in, r1); // ar -ai br -bi
|
||||||
}
|
}
|
||||||
// Complex double
|
// Complex double
|
||||||
//inline float32x4_t operator()(float32x4_t in){
|
inline float64x2_t operator()(float64x2_t in){
|
||||||
// return 0;
|
|
||||||
//}
|
float64x2_t r0, r1;
|
||||||
|
r0 = vextq_f64(in, in, 1); // ai ar
|
||||||
|
r1 = vnegq_f64(r0); // -ai -ar
|
||||||
|
return vextq_f64(r0, r1, 1); // ar -ai
|
||||||
|
}
|
||||||
// do not define for integer input
|
// do not define for integer input
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TimesMinusI{
|
struct TimesMinusI{
|
||||||
//Complex single
|
//Complex single
|
||||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||||
return in;
|
// ar ai br bi -> ai -ar ai -br
|
||||||
|
float32x4_t r0, r1;
|
||||||
|
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||||
|
r1 = vrev64q_f32(in); // ai ar bi br
|
||||||
|
return vtrn1q_f32(r1, r0); // ar -ai br -bi
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||||
// return in;
|
// a ib -> b -ia
|
||||||
//}
|
float64x2_t tmp;
|
||||||
|
tmp = vnegq_f64(in);
|
||||||
|
return vextq_f64(in, tmp, 1);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TimesI{
|
struct TimesI{
|
||||||
//Complex single
|
//Complex single
|
||||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||||
//need shuffle
|
// ar ai br bi -> -ai ar -bi br
|
||||||
return in;
|
float32x4_t r0, r1;
|
||||||
|
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||||
|
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
|
||||||
|
return vtrn1q_f32(r1, in); // -ai ar -bi br
|
||||||
}
|
}
|
||||||
//Complex double
|
//Complex double
|
||||||
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||||
// return 0;
|
// a ib -> -b ia
|
||||||
//}
|
float64x2_t tmp;
|
||||||
|
tmp = vnegq_f64(in);
|
||||||
|
return vextq_f64(tmp, in, 1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Permute{
|
||||||
|
|
||||||
|
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
|
||||||
|
// AB CD -> CD AB
|
||||||
|
return vextq_f32(in, in, 2);
|
||||||
|
};
|
||||||
|
static inline float32x4_t Permute1(float32x4_t in){ // N:ok
|
||||||
|
// AB CD -> BA DC
|
||||||
|
return vrev64q_f32(in);
|
||||||
|
};
|
||||||
|
static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline float64x2_t Permute0(float64x2_t in){ // N:ok
|
||||||
|
// AB -> BA
|
||||||
|
return vextq_f64(in, in, 1);
|
||||||
|
};
|
||||||
|
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Rotate{
|
||||||
|
|
||||||
|
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
|
||||||
|
switch(n){
|
||||||
|
case 0: // AB CD -> AB CD
|
||||||
|
return tRotate<0>(in);
|
||||||
|
break;
|
||||||
|
case 1: // AB CD -> BC DA
|
||||||
|
return tRotate<1>(in);
|
||||||
|
break;
|
||||||
|
case 2: // AB CD -> CD AB
|
||||||
|
return tRotate<2>(in);
|
||||||
|
break;
|
||||||
|
case 3: // AB CD -> DA BC
|
||||||
|
return tRotate<3>(in);
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
|
||||||
|
switch(n){
|
||||||
|
case 0: // AB -> AB
|
||||||
|
return tRotate<0>(in);
|
||||||
|
break;
|
||||||
|
case 1: // AB -> BA
|
||||||
|
return tRotate<1>(in);
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// working, but no restriction on n
|
||||||
|
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
|
||||||
|
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
|
||||||
|
|
||||||
|
// restriction on n
|
||||||
|
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
|
||||||
|
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PrecisionChange {
|
||||||
|
|
||||||
|
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
|
||||||
|
float16x4_t h = vcvt_f16_f32(a);
|
||||||
|
return vcvt_high_f16_f32(h, b);
|
||||||
|
}
|
||||||
|
static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
|
||||||
|
sb = vcvt_high_f32_f16(h);
|
||||||
|
// there is no direct conversion from lower float32x4_t to float64x2_t
|
||||||
|
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
|
||||||
|
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
|
||||||
|
// workaround for clang
|
||||||
|
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
|
||||||
|
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
|
||||||
|
sa = vcvt_high_f32_f16(h1);
|
||||||
|
}
|
||||||
|
static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
|
||||||
|
float32x2_t s = vcvt_f32_f64(a);
|
||||||
|
return vcvt_high_f32_f64(s, b);
|
||||||
|
|
||||||
|
}
|
||||||
|
static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
|
||||||
|
b = vcvt_high_f64_f32(s);
|
||||||
|
// there is no direct conversion from lower float32x4_t to float64x2_t
|
||||||
|
float32x4_t s1 = vextq_f32(s, s, 2);
|
||||||
|
a = vcvt_high_f64_f32(s1);
|
||||||
|
|
||||||
|
}
|
||||||
|
static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
|
||||||
|
float32x4_t s1 = DtoS(a, b);
|
||||||
|
float32x4_t s2 = DtoS(c, d);
|
||||||
|
return StoH(s1, s2);
|
||||||
|
}
|
||||||
|
static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
|
||||||
|
float32x4_t s1, s2;
|
||||||
|
HtoS(h, s1, s2);
|
||||||
|
StoD(s1, a, b);
|
||||||
|
StoD(s2, c, d);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////
|
||||||
|
// Exchange support
|
||||||
|
|
||||||
|
struct Exchange{
|
||||||
|
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
// in1: ABCD -> out1: ABEF
|
||||||
|
// in2: EFGH -> out2: CDGH
|
||||||
|
|
||||||
|
// z: CDAB
|
||||||
|
float32x4_t z = vextq_f32(in1, in1, 2);
|
||||||
|
// out1: ABEF
|
||||||
|
out1 = vextq_f32(z, in2, 2);
|
||||||
|
|
||||||
|
// z: GHEF
|
||||||
|
z = vextq_f32(in2, in2, 2);
|
||||||
|
// out2: CDGH
|
||||||
|
out2 = vextq_f32(in1, z, 2);
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
// in1: ABCD -> out1: AECG
|
||||||
|
// in2: EFGH -> out2: BFDH
|
||||||
|
out1 = vtrn1q_f32(in1, in2);
|
||||||
|
out2 = vtrn2q_f32(in1, in2);
|
||||||
|
};
|
||||||
|
static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
// double precision
|
||||||
|
static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
// in1: AB -> out1: AC
|
||||||
|
// in2: CD -> out2: BD
|
||||||
|
out1 = vzip1q_f64(in1, in2);
|
||||||
|
out2 = vzip2q_f64(in1, in2);
|
||||||
|
};
|
||||||
|
static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
|
||||||
|
assert(0);
|
||||||
|
return;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
// Some Template specialization
|
// Some Template specialization
|
||||||
template < typename vtype >
|
|
||||||
void permute(vtype &a, vtype b, int perm) {
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
//Complex float Reduce
|
//Complex float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
|
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
|
||||||
return 0;
|
float32x4_t v1; // two complex
|
||||||
|
v1 = Optimization::Permute::Permute0(in);
|
||||||
|
v1 = vaddq_f32(v1,in);
|
||||||
|
u128f conv; conv.v=v1;
|
||||||
|
return Grid::ComplexF(conv.f[0],conv.f[1]);
|
||||||
}
|
}
|
||||||
//Real float Reduce
|
//Real float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
|
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
|
||||||
float32x2_t high = vget_high_f32(in);
|
return vaddvq_f32(in);
|
||||||
float32x2_t low = vget_low_f32(in);
|
|
||||||
float32x2_t tmp = vadd_f32(low, high);
|
|
||||||
float32x2_t sum = vpadd_f32(tmp, tmp);
|
|
||||||
return vget_lane_f32(sum,0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//Complex double Reduce
|
//Complex double Reduce
|
||||||
template<>
|
template<> // N:by Boyle
|
||||||
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
|
||||||
return 0;
|
u128d conv; conv.v = in;
|
||||||
|
return Grid::ComplexD(conv.f[0],conv.f[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
//Real double Reduce
|
//Real double Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
|
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
|
||||||
float64x2_t sum = vpaddq_f64(in, in);
|
return vaddvq_f64(in);
|
||||||
return vgetq_lane_f64(sum,0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
|
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
|
||||||
// FIXME unimplemented
|
// FIXME unimplemented
|
||||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
printf("Reduce : Missing integer implementation -> FIX\n");
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Here assign types
|
// Here assign types
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
|
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
|
||||||
|
typedef float16x8_t SIMD_Htype; // Half precision type
|
||||||
typedef float32x4_t SIMD_Ftype; // Single precision type
|
typedef float32x4_t SIMD_Ftype; // Single precision type
|
||||||
typedef float64x2_t SIMD_Dtype; // Double precision type
|
typedef float64x2_t SIMD_Dtype; // Double precision type
|
||||||
typedef uint32x4_t SIMD_Itype; // Integer type
|
typedef uint32x4_t SIMD_Itype; // Integer type
|
||||||
@ -312,13 +581,6 @@ namespace Grid {
|
|||||||
inline void prefetch_HINT_T0(const char *ptr){};
|
inline void prefetch_HINT_T0(const char *ptr){};
|
||||||
|
|
||||||
|
|
||||||
// Gpermute function
|
|
||||||
template < typename VectorSIMD >
|
|
||||||
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
|
||||||
Optimization::permute(y.v,b.v,perm);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Function name aliases
|
// Function name aliases
|
||||||
typedef Optimization::Vsplat VsplatSIMD;
|
typedef Optimization::Vsplat VsplatSIMD;
|
||||||
typedef Optimization::Vstore VstoreSIMD;
|
typedef Optimization::Vstore VstoreSIMD;
|
||||||
@ -332,8 +594,11 @@ namespace Grid {
|
|||||||
// Arithmetic operations
|
// Arithmetic operations
|
||||||
typedef Optimization::Sum SumSIMD;
|
typedef Optimization::Sum SumSIMD;
|
||||||
typedef Optimization::Sub SubSIMD;
|
typedef Optimization::Sub SubSIMD;
|
||||||
|
typedef Optimization::Div DivSIMD;
|
||||||
typedef Optimization::Mult MultSIMD;
|
typedef Optimization::Mult MultSIMD;
|
||||||
typedef Optimization::MultComplex MultComplexSIMD;
|
typedef Optimization::MultComplex MultComplexSIMD;
|
||||||
|
typedef Optimization::MultRealPart MultRealPartSIMD;
|
||||||
|
typedef Optimization::MaddRealPart MaddRealPartSIMD;
|
||||||
typedef Optimization::Conj ConjSIMD;
|
typedef Optimization::Conj ConjSIMD;
|
||||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||||
typedef Optimization::TimesI TimesISIMD;
|
typedef Optimization::TimesI TimesISIMD;
|
||||||
|
@ -374,6 +374,84 @@ namespace Optimization {
|
|||||||
// Complex float
|
// Complex float
|
||||||
FLOAT_WRAP_2(operator(), inline)
|
FLOAT_WRAP_2(operator(), inline)
|
||||||
};
|
};
|
||||||
|
#define USE_FP16
|
||||||
|
struct PrecisionChange {
|
||||||
|
static inline vech StoH (const vector4float &a, const vector4float &b) {
|
||||||
|
vech ret;
|
||||||
|
std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl;
|
||||||
|
assert(0);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
static inline void HtoS (vech h, vector4float &sa, vector4float &sb) {
|
||||||
|
std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl;
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
static inline vector4float DtoS (vector4double a, vector4double b) {
|
||||||
|
vector4float ret;
|
||||||
|
std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl;
|
||||||
|
assert(0);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
static inline void StoD (vector4float s, vector4double &a, vector4double &b) {
|
||||||
|
std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl;
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
static inline vech DtoH (vector4double a, vector4double b,
|
||||||
|
vector4double c, vector4double d) {
|
||||||
|
vech ret;
|
||||||
|
std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl;
|
||||||
|
assert(0);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
static inline void HtoD (vech h, vector4double &a, vector4double &b,
|
||||||
|
vector4double &c, vector4double &d) {
|
||||||
|
std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl;
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////
|
||||||
|
// Exchange support
|
||||||
|
#define FLOAT_WRAP_EXCHANGE(fn) \
|
||||||
|
static inline void fn(vector4float &out1, vector4float &out2, \
|
||||||
|
vector4float in1, vector4float in2) \
|
||||||
|
{ \
|
||||||
|
vector4double out1d, out2d, in1d, in2d; \
|
||||||
|
in1d = Vset()(in1); \
|
||||||
|
in2d = Vset()(in2); \
|
||||||
|
fn(out1d, out2d, in1d, in2d); \
|
||||||
|
Vstore()(out1d, out1); \
|
||||||
|
Vstore()(out2d, out2); \
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Exchange{
|
||||||
|
|
||||||
|
// double precision
|
||||||
|
static inline void Exchange0(vector4double &out1, vector4double &out2,
|
||||||
|
vector4double in1, vector4double in2) {
|
||||||
|
out1 = vec_perm(in1, in2, vec_gpci(0145));
|
||||||
|
out2 = vec_perm(in1, in2, vec_gpci(02367));
|
||||||
|
}
|
||||||
|
static inline void Exchange1(vector4double &out1, vector4double &out2,
|
||||||
|
vector4double in1, vector4double in2) {
|
||||||
|
out1 = vec_perm(in1, in2, vec_gpci(0426));
|
||||||
|
out2 = vec_perm(in1, in2, vec_gpci(01537));
|
||||||
|
}
|
||||||
|
static inline void Exchange2(vector4double &out1, vector4double &out2,
|
||||||
|
vector4double in1, vector4double in2) {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
static inline void Exchange3(vector4double &out1, vector4double &out2,
|
||||||
|
vector4double in1, vector4double in2) {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// single precision
|
||||||
|
FLOAT_WRAP_EXCHANGE(Exchange0);
|
||||||
|
FLOAT_WRAP_EXCHANGE(Exchange1);
|
||||||
|
FLOAT_WRAP_EXCHANGE(Exchange2);
|
||||||
|
FLOAT_WRAP_EXCHANGE(Exchange3);
|
||||||
|
};
|
||||||
|
|
||||||
struct Permute{
|
struct Permute{
|
||||||
//Complex double
|
//Complex double
|
||||||
@ -497,15 +575,19 @@ namespace Optimization {
|
|||||||
|
|
||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Integer Reduce<Integer, int>::operator()(int in){
|
inline Integer Reduce<Integer, veci>::operator()(veci in){
|
||||||
// FIXME unimplemented
|
Integer a = 0;
|
||||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
for (unsigned int i = 0; i < W<Integer>::r; ++i)
|
||||||
assert(0);
|
{
|
||||||
|
a += in.v[i];
|
||||||
|
}
|
||||||
|
return a;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Here assign types
|
// Here assign types
|
||||||
|
typedef Optimization::vech SIMD_Htype; // Half precision type
|
||||||
typedef Optimization::vector4float SIMD_Ftype; // Single precision type
|
typedef Optimization::vector4float SIMD_Ftype; // Single precision type
|
||||||
typedef vector4double SIMD_Dtype; // Double precision type
|
typedef vector4double SIMD_Dtype; // Double precision type
|
||||||
typedef Optimization::veci SIMD_Itype; // Integer type
|
typedef Optimization::veci SIMD_Itype; // Integer type
|
||||||
|
@@ -570,9 +570,9 @@ namespace Optimization {
   //Integer Reduce
   template<>
   inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    __m128i v1 = _mm_hadd_epi32(in, in);
+    __m128i v2 = _mm_hadd_epi32(v1, v1);
+    return _mm_cvtsi128_si32(v2);
   }
 }
|
|
||||||
|
@ -53,7 +53,7 @@ directory
|
|||||||
#if defined IMCI
|
#if defined IMCI
|
||||||
#include "Grid_imci.h"
|
#include "Grid_imci.h"
|
||||||
#endif
|
#endif
|
||||||
#ifdef NEONv8
|
#ifdef NEONV8
|
||||||
#include "Grid_neon.h"
|
#include "Grid_neon.h"
|
||||||
#endif
|
#endif
|
||||||
#if defined QPX
|
#if defined QPX
|
||||||
@ -327,10 +327,6 @@ class Grid_simd {
|
|||||||
// provides support
|
// provides support
|
||||||
///////////////////////////////////////
|
///////////////////////////////////////
|
||||||
|
|
||||||
//#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
|
|
||||||
//#pragma GCC push_options
|
|
||||||
//#pragma GCC optimize ("O0")
|
|
||||||
//#endif
|
|
||||||
template <class functor>
|
template <class functor>
|
||||||
friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
|
friend inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
|
||||||
Grid_simd ret;
|
Grid_simd ret;
|
||||||
@ -364,9 +360,6 @@ class Grid_simd {
|
|||||||
ret.v = cx.v;
|
ret.v = cx.v;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
//#if (__GNUC__ == 5 ) || ( ( __GNUC__ == 6 ) && __GNUC_MINOR__ < 3 )
|
|
||||||
//#pragma GCC pop_options
|
|
||||||
//#endif
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Exchange
|
// Exchange
|
||||||
// Al Ah , Bl Bh -> Al Bl Ah,Bh
|
// Al Ah , Bl Bh -> Al Bl Ah,Bh
|
||||||
@ -428,7 +421,6 @@ class Grid_simd {
|
|||||||
|
|
||||||
}; // end of Grid_simd class definition
|
}; // end of Grid_simd class definition
|
||||||
|
|
||||||
|
|
||||||
inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; }
|
inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; }
|
||||||
inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; }
|
inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; }
|
||||||
inline void permute(RealD &y,RealD b, int perm) { y=b; }
|
inline void permute(RealD &y,RealD b, int perm) { y=b; }
|
||||||
@ -759,8 +751,8 @@ inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
|
|||||||
|
|
||||||
conv.v = in.v;
|
conv.v = in.v;
|
||||||
for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
|
for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
|
||||||
assert(conv.s[i + 1] ==
|
assert(conv.s[i + 1] == conv.s[i]);
|
||||||
conv.s[i]); // trap any cases where real was not duplicated
|
// trap any cases where real was not duplicated
|
||||||
// indicating the SIMD grids of real and imag assignment did not correctly
|
// indicating the SIMD grids of real and imag assignment did not correctly
|
||||||
// match
|
// match
|
||||||
conv.s[i + 1] = 0.0; // zero imaginary parts
|
conv.s[i + 1] = 0.0; // zero imaginary parts
|
||||||
@ -838,8 +830,6 @@ inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionCha
|
|||||||
inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
|
inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
|
||||||
inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
|
inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Check our vector types are of an appropriate size.
|
// Check our vector types are of an appropriate size.
|
||||||
#if defined QPX
|
#if defined QPX
|
||||||
static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
|
static_assert(2*sizeof(SIMD_Ftype) == sizeof(SIMD_Dtype), "SIMD vector lengths incorrect");
|
||||||
@ -854,21 +844,14 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
|
|||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
template <typename T>
|
template <typename T>
|
||||||
struct is_simd : public std::false_type {};
|
struct is_simd : public std::false_type {};
|
||||||
template <>
|
template <> struct is_simd<vRealF> : public std::true_type {};
|
||||||
struct is_simd<vRealF> : public std::true_type {};
|
template <> struct is_simd<vRealD> : public std::true_type {};
|
||||||
template <>
|
template <> struct is_simd<vComplexF> : public std::true_type {};
|
||||||
struct is_simd<vRealD> : public std::true_type {};
|
template <> struct is_simd<vComplexD> : public std::true_type {};
|
||||||
template <>
|
template <> struct is_simd<vInteger> : public std::true_type {};
|
||||||
struct is_simd<vComplexF> : public std::true_type {};
|
|
||||||
template <>
|
|
||||||
struct is_simd<vComplexD> : public std::true_type {};
|
|
||||||
template <>
|
|
||||||
struct is_simd<vInteger> : public std::true_type {};
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T> using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
|
||||||
using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
|
template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
|
||||||
template <typename T>
|
|
||||||
using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -179,13 +179,6 @@ inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
 ////////////////////////////////////////////////////////////////////////////
 // Allows us to assign into **conformable** real vectors from complex
 ////////////////////////////////////////////////////////////////////////////
-// template < class S, class V >
-// inline auto ComplexRemove(const Grid_simd<S,V> &c) ->
-// Grid_simd<Grid_simd<S,V>::Real,V> {
-// Grid_simd<Grid_simd<S,V>::Real,V> ret;
-// ret.v = c.v;
-// return ret;
-// }
 template <class scalar>
 struct AndFunctor {
   scalar operator()(const scalar &x, const scalar &y) const { return x & y; }
@@ -32,8 +32,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {

 int LebesgueOrder::UseLebesgueOrder;
+#ifdef KNL
 std::vector<int> LebesgueOrder::Block({8,2,2,2});
+#else
+std::vector<int> LebesgueOrder::Block({2,2,2,2});
+#endif
 LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
   n--; // 1000 0011 --> 1000 0010
   n |= n >> 1; // 1000 0010 | 0100 0001 = 1100 0011
@@ -51,8 +54,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
   if ( Block[0]==0) ZGraph();
   else if ( Block[1]==0) NoBlocking();
   else CartesianBlocking();
+
+  if (0) {
+    std::cout << "Thread Interleaving"<<std::endl;
+    ThreadInterleave();
+  }
 }
+void LebesgueOrder::ThreadInterleave(void)
+{
+  std::vector<IndexInteger> reorder = _LebesgueReorder;
+  std::vector<IndexInteger> throrder;
+  int vol = _LebesgueReorder.size();
+  int threads = GridThread::GetThreads();
+  int blockbits=3;
+  int blocklen = 8;
+  int msk = 0x7;
+
+  for(int t=0;t<threads;t++){
+    for(int ss=0;ss<vol;ss++){
+      if ( ( ss >> blockbits) % threads == t ) {
+        throrder.push_back(reorder[ss]);
+      }
+    }
+  }
+  _LebesgueReorder = throrder;
+}
 void LebesgueOrder::NoBlocking(void)
 {
   std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
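The new reordering hands out consecutive blocks of 2^blockbits = 8 sites to threads in round-robin fashion. A throwaway illustration of the same index arithmetic on a toy volume, independent of Grid (the numbers are arbitrary):

// Toy model of LebesgueOrder::ThreadInterleave(): sites are grouped in blocks
// of 8, and block b is assigned to thread b % threads.
#include <cstdio>
#include <vector>

int main() {
  const int vol = 32, threads = 2, blockbits = 3;
  std::vector<int> reorder(vol), throrder;
  for (int ss = 0; ss < vol; ss++) reorder[ss] = ss;   // identity base ordering

  for (int t = 0; t < threads; t++)
    for (int ss = 0; ss < vol; ss++)
      if (((ss >> blockbits) % threads) == t) throrder.push_back(reorder[ss]);

  // Prints 0..7, 16..23 (thread 0's blocks) then 8..15, 24..31 (thread 1's blocks).
  for (int s : throrder) printf("%d ", s);
  printf("\n");
}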
@@ -70,6 +70,8 @@ namespace Grid {
                            std::vector<IndexInteger> & xi,
                            std::vector<IndexInteger> &dims);

+    void ThreadInterleave(void);
+
   private:
     std::vector<IndexInteger> _LebesgueReorder;

@@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   // Timing info; ugly; possibly temporary
   /////////////////////////////////////////
   double commtime;
+  double mpi3synctime;
+  double mpi3synctime_g;
+  double shmmergetime;
   double gathertime;
   double gathermtime;
   double halogtime;
@@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   double splicetime;
   double nosplicetime;
   double calls;
+  std::vector<double> comm_bytes_thr;
+  std::vector<double> comm_time_thr;
+  std::vector<double> comm_enter_thr;
+  std::vector<double> comm_leave_thr;

   ////////////////////////////////////////
   // Stencil query
@@ -248,35 +255,120 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   //////////////////////////////////////////
   // Comms packet queue for asynch thread
   //////////////////////////////////////////
+  void CommunicateThreaded()
+  {
+#ifdef GRID_OMP
+    // must be called in parallel region
+    int mythread = omp_get_thread_num();
+    int nthreads = CartesianCommunicator::nCommThreads;
+#else
+    int mythread = 0;
+    int nthreads = 1;
+#endif
+    if (nthreads == -1) nthreads = 1;
+    if (mythread < nthreads) {
+      comm_enter_thr[mythread] = usecond();
+      for (int i = mythread; i < Packets.size(); i += nthreads) {
+        uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+                                                      Packets[i].to_rank,
+                                                      Packets[i].recv_buf,
+                                                      Packets[i].from_rank,
+                                                      Packets[i].bytes,i);
+        comm_bytes_thr[mythread] += bytes;
+      }
+      comm_leave_thr[mythread]= usecond();
+      comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
+    }
+  }
+
+  void CollateThreads(void)
+  {
+    int nthreads = CartesianCommunicator::nCommThreads;
+    double first=0.0;
+    double last =0.0;
+
+    for(int t=0;t<nthreads;t++) {
+
+      double t0 = comm_enter_thr[t];
+      double t1 = comm_leave_thr[t];
+      comms_bytes+=comm_bytes_thr[t];
+
+      comm_enter_thr[t] = 0.0;
+      comm_leave_thr[t] = 0.0;
+      comm_time_thr[t] = 0.0;
+      comm_bytes_thr[t]=0;
+
+      if ( first == 0.0 ) first = t0; // first is t0
+      if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
+
+      if ( t1 > last ) last = t1; // max time seen
+
+    }
+    commtime+= last-first;
+  }
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
       comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
                                                      Packets[i].send_buf,
                                                      Packets[i].to_rank,
                                                      Packets[i].recv_buf,
                                                      Packets[i].from_rank,
-                                                     Packets[i].bytes);
+                                                     Packets[i].bytes,i);
     }
   }

   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromComplete(reqs[i]);
+      _grid->StencilSendToRecvFromComplete(reqs[i],i);
     }
     commtime+=usecond();
   }
+  void Communicate(void)
+  {
+#ifdef GRID_OMP
+#pragma omp parallel
+    {
+      // must be called in parallel region
+      int mythread = omp_get_thread_num();
+      int maxthreads= omp_get_max_threads();
+      int nthreads = CartesianCommunicator::nCommThreads;
+      assert(nthreads <= maxthreads);
+
+      if (nthreads == -1) nthreads = 1;
+#else
+      int mythread = 0;
+      int nthreads = 1;
+#endif
+      if (mythread < nthreads) {
+        for (int i = mythread; i < Packets.size(); i += nthreads) {
+          double start = usecond();
+          comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+                                                                   Packets[i].to_rank,
+                                                                   Packets[i].recv_buf,
+                                                                   Packets[i].from_rank,
+                                                                   Packets[i].bytes,i);
+          comm_time_thr[mythread] += usecond() - start;
+        }
+      }
+#ifdef GRID_OMP
+    }
+#endif
+  }
+
   template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
   {
     std::vector<std::vector<CommsRequest_t> > reqs;
     Prepare();
     HaloGather(source,compress);
-    CommunicateBegin(reqs);
-    CommunicateComplete(reqs);
+    // Concurrent
+    //CommunicateBegin(reqs);
+    //CommunicateComplete(reqs);
+    // Sequential, possibly threaded
+    Communicate();
     CommsMergeSHM(compress);
     CommsMerge(compress);
   }
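Both CommunicateThreaded() and the new Communicate() path divide the packet list over the communication threads with the same strided loop, for (i = mythread; i < Packets.size(); i += nthreads). A standalone sketch of which packet lands on which thread; the packet count and thread count below are arbitrary illustrative values, not taken from this commit:

// Strided packet ownership used by the threaded communicate paths above:
// thread t handles packets t, t+nthreads, t+2*nthreads, ...
#include <cstdio>

int main() {
  const int npackets = 8;   // e.g. one packet per stencil direction (illustrative)
  const int nthreads = 2;   // stands in for CartesianCommunicator::nCommThreads; -1 falls back to 1

  for (int mythread = 0; mythread < nthreads; mythread++) {
    printf("thread %d handles packets:", mythread);
    for (int i = mythread; i < npackets; i += nthreads) printf(" %d", i);
    printf("\n");
  }
}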
@@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   template<class compressor>
   void HaloGather(const Lattice<vobj> &source,compressor &compress)
   {
+    mpi3synctime_g-=usecond();
     _grid->StencilBarrier();// Synch shared memory on a single nodes
+    mpi3synctime_g+=usecond();

     // conformable(source._grid,_grid);
     assert(source._grid==_grid);
|
|||||||
CommsMerge(decompress,Mergers,Decompressions);
|
CommsMerge(decompress,Mergers,Decompressions);
|
||||||
}
|
}
|
||||||
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
|
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
|
||||||
|
mpi3synctime-=usecond();
|
||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
|
mpi3synctime+=usecond();
|
||||||
|
shmmergetime-=usecond();
|
||||||
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
|
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
|
||||||
|
shmmergetime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class decompressor>
|
template<class decompressor>
|
||||||
@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
int checkerboard,
|
int checkerboard,
|
||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: _permute_type(npoints), _comm_buf_size(npoints)
|
: _permute_type(npoints),
|
||||||
|
_comm_buf_size(npoints),
|
||||||
|
comm_bytes_thr(npoints),
|
||||||
|
comm_enter_thr(npoints),
|
||||||
|
comm_leave_thr(npoints),
|
||||||
|
comm_time_thr(npoints)
|
||||||
{
|
{
|
||||||
face_table_computed=0;
|
face_table_computed=0;
|
||||||
_npoints = npoints;
|
_npoints = npoints;
|
||||||
@@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   void ZeroCounters(void) {
     gathertime = 0.;
     commtime = 0.;
+    mpi3synctime=0.;
+    mpi3synctime_g=0.;
+    shmmergetime=0.;
+    for(int i=0;i<_npoints;i++){
+      comm_time_thr[i]=0;
+      comm_bytes_thr[i]=0;
+      comm_enter_thr[i]=0;
+      comm_leave_thr[i]=0;
+    }
     halogtime = 0.;
     mergetime = 0.;
     decompresstime = 0.;
|
|||||||
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
||||||
RealD NP = _grid->_Nprocessors;
|
RealD NP = _grid->_Nprocessors;
|
||||||
RealD NN = _grid->NodeCount();
|
RealD NN = _grid->NodeCount();
|
||||||
|
double t = 0;
|
||||||
|
// if comm_time_thr is set they were all done in parallel so take the max
|
||||||
|
// but add up the bytes
|
||||||
|
int threaded = 0 ;
|
||||||
|
for (int i = 0; i < 8; ++i) {
|
||||||
|
if ( comm_time_thr[i]>0.0 ) {
|
||||||
|
threaded = 1;
|
||||||
|
comms_bytes += comm_bytes_thr[i];
|
||||||
|
if (t < comm_time_thr[i]) t = comm_time_thr[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (threaded) commtime += t;
|
||||||
|
|
||||||
_grid->GlobalSum(commtime); commtime/=NP;
|
_grid->GlobalSum(commtime); commtime/=NP;
|
||||||
if ( calls > 0. ) {
|
if ( calls > 0. ) {
|
||||||
@@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
       std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
       std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
     }
+    PRINTIT(mpi3synctime);
+    PRINTIT(mpi3synctime_g);
+    PRINTIT(shmmergetime);
     PRINTIT(splicetime);
     PRINTIT(nosplicetime);
   }
@@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
 strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
                         const iVector<vtype,N> * __restrict__ rhs,
                         const iScalar<mtype> * __restrict__ lhs){
-  mult(ret,lhs,rhs);
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal);
+  }
 }


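The rewritten body keeps each vector component (rhs->_internal[c1]) as the left factor and the scalar's payload as the right factor, rather than delegating to mult(ret,lhs,rhs) with the operands swapped. For non-commutative component types the order matters; a self-contained reminder with plain 2x2 matrices (not Grid types, purely illustrative):

// A*B and B*A differ for matrices, which is why the component-wise loop above
// must preserve the vector-times-scalar operand order.
#include <array>
#include <cstdio>

using Mat = std::array<std::array<int,2>,2>;

Mat mul(const Mat &a, const Mat &b) {
  Mat c{};
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      for (int k = 0; k < 2; k++) c[i][j] += a[i][k] * b[k][j];
  return c;
}

int main() {
  Mat A{{{1, 2}, {0, 1}}};
  Mat B{{{1, 0}, {3, 1}}};
  Mat AB = mul(A, B), BA = mul(B, A);
  printf("AB[0][0]=%d  BA[0][0]=%d\n", AB[0][0], BA[0][0]); // 7 vs 1: not equal
}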
@@ -156,11 +156,18 @@ class iScalar {

   // convert from a something to a scalar via constructor of something arg
   template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
   strong_inline iScalar<vtype> operator=(T arg) {
     _internal = arg;
     return *this;
   }
+
+  // Convert elements
+  template <class ttype>
+  strong_inline iScalar<vtype> operator=(iScalar<ttype> &&arg) {
+    _internal = arg._internal;
+    return *this;
+  }

   friend std::ostream &operator<<(std::ostream &stream,const iScalar<vtype> &o) {
     stream << "S {" << o._internal << "}";
     return stream;
@@ -80,8 +80,11 @@ template<class vtype, int N> inline iVector<vtype, N> Exponentiate(const iVector
   mat iQ2 = arg*arg*alpha*alpha;
   mat iQ3 = arg*iQ2*alpha;
   // sign in c0 from the conventions on the Ta
-  c0 = -imag( trace(iQ3) ) * one_over_three;
-  c1 = -real( trace(iQ2) ) * one_over_two;
+  scalar imQ3, reQ2;
+  imQ3 = imag( trace(iQ3) );
+  reQ2 = real( trace(iQ2) );
+  c0 = -imQ3 * one_over_three;
+  c1 = -reQ2 * one_over_two;

   // Cayley Hamilton checks to machine precision, tested
   tmp = c1 * one_over_three;
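For reference, the staged temporaries compute the same Cayley-Hamilton coefficients as before, only routed through named intermediates. Writing Q for alpha times the argument, so that iQ2 and iQ3 above hold Q squared and Q cubed, the code corresponds to

\[ c_0 = -\tfrac{1}{3}\,\mathrm{Im}\,\mathrm{tr}\,Q^{3}, \qquad c_1 = -\tfrac{1}{2}\,\mathrm{Re}\,\mathrm{tr}\,Q^{2} . \]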
@@ -47,6 +47,28 @@ template<int Level>
 class TensorIndexRecursion {

  public:

+  ////////////////////////////////////////////////////
+  // Type Queries
+  ////////////////////////////////////////////////////
+  template<class vtype> static inline int indexRank(const iScalar<vtype> tmp) { return TensorIndexRecursion<Level-1>::indexRank(tmp._internal); }
+  template<class vtype,int N> static inline int indexRank(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::indexRank(tmp._internal[0]); }
+  template<class vtype,int N> static inline int indexRank(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::indexRank(tmp._internal[0][0]); }
+
+  template<class vtype> static inline int isScalar(const iScalar<vtype> tmp) { return TensorIndexRecursion<Level-1>::isScalar(tmp._internal); }
+  template<class vtype,int N> static inline int isScalar(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isScalar(tmp._internal[0]); }
+  template<class vtype,int N> static inline int isScalar(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isScalar(tmp._internal[0][0]); }
+
+  template<class vtype> static inline int isVector(const iScalar<vtype> tmp) { return TensorIndexRecursion<Level-1>::isVector(tmp._internal); }
+  template<class vtype,int N> static inline int isVector(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isVector(tmp._internal[0]); }
+  template<class vtype,int N> static inline int isVector(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isVector(tmp._internal[0][0]); }
+
+  template<class vtype> static inline int isMatrix(const iScalar<vtype> tmp) { return TensorIndexRecursion<Level-1>::isMatrix(tmp._internal); }
+  template<class vtype,int N> static inline int isMatrix(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isMatrix(tmp._internal[0]); }
+  template<class vtype,int N> static inline int isMatrix(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isMatrix(tmp._internal[0][0]); }
+  ////////////////////////////////////////////////////
+  // Trace
+  ////////////////////////////////////////////////////
 template<class vtype>
 static auto traceIndex(const iScalar<vtype> arg) -> iScalar<decltype(TensorIndexRecursion<Level-1>::traceIndex(arg._internal))>
 {
@@ -215,6 +237,24 @@ class TensorIndexRecursion {
 template<>
 class TensorIndexRecursion<0> {
  public:
+  ////////////////////////////////////////////////////
+  // Type Queries
+  ////////////////////////////////////////////////////
+  template<class vtype> static inline int indexRank(const iScalar<vtype> tmp) { return 1; }
+  template<class vtype,int N> static inline int indexRank(const iVector<vtype,N> tmp){ return N; }
+  template<class vtype,int N> static inline int indexRank(const iMatrix<vtype,N> tmp){ return N; }
+
+  template<class vtype> static inline int isScalar(const iScalar<vtype> tmp) { return true;}
+  template<class vtype,int N> static inline int isScalar(const iVector<vtype,N> tmp){ return false;}
+  template<class vtype,int N> static inline int isScalar(const iMatrix<vtype,N> tmp){ return false;}
+
+  template<class vtype> static inline int isVector(const iScalar<vtype> tmp) { return false;}
+  template<class vtype,int N> static inline int isVector(const iVector<vtype,N> tmp){ return true;}
+  template<class vtype,int N> static inline int isVector(const iMatrix<vtype,N> tmp){ return false;}
+
+  template<class vtype> static inline int isMatrix(const iScalar<vtype> tmp) { return false;}
+  template<class vtype,int N> static inline int isMatrix(const iVector<vtype,N> tmp){ return false;}
+  template<class vtype,int N> static inline int isMatrix(const iMatrix<vtype,N> tmp){ return true;}
+
 /////////////////////////////////////////
 // Ends recursion for trace (scalar/vector/matrix)
@@ -302,6 +342,26 @@ class TensorIndexRecursion<0> {
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 // External wrappers
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<int Level,class vtype> inline int indexRank(void)
+{
+  vtype tmp;
+  return TensorIndexRecursion<Level>::indexRank(tmp);
+}
+template<int Level,class vtype> inline int isScalar(void)
+{
+  vtype tmp;
+  return TensorIndexRecursion<Level>::isScalar(tmp);
+}
+template<int Level,class vtype> inline int isVector(void)
+{
+  vtype tmp;
+  return TensorIndexRecursion<Level>::isVector(tmp);
+}
+template<int Level,class vtype> inline int isMatrix(void)
+{
+  vtype tmp;
+  return TensorIndexRecursion<Level>::isMatrix(tmp);
+}
+
 template<int Level,class vtype> inline auto traceIndex (const vtype &arg) -> RemoveCRV(TensorIndexRecursion<Level>::traceIndex(arg))
 {
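A sketch of how the new wrappers might be queried. The nested tensor type below is an arbitrary illustration (Level 0 is taken to be the outer vector, Level 1 the inner matrix, as implied by the recursion above); it is not a type introduced by this commit:

// Illustrative only; assumes Grid headers are available.
#include <Grid/Grid.h>
using namespace Grid;

typedef iVector<iMatrix<vComplexF,3>,4> nested; // Level 0: vector of 4, Level 1: 3x3 matrix

int main(int argc, char **argv) {
  std::cout << indexRank<0,nested>() << std::endl; // expect 4 (outer vector dimension)
  std::cout << indexRank<1,nested>() << std::endl; // expect 3 (inner matrix dimension)
  std::cout << isVector<0,nested>()  << std::endl; // expect 1
  std::cout << isMatrix<1,nested>()  << std::endl; // expect 1
  return 0;
}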
@@ -281,8 +281,8 @@ namespace Grid {
   template<typename T>
   class getPrecision{
   public:
-    typedef typename getVectorType<T>::type vector_obj; //get the vector_obj (i.e. a grid Tensor) if its a Lattice<vobj>, do nothing otherwise (i.e. if fundamental or grid Tensor)
+    //get the vector_obj (i.e. a grid Tensor) if its a Lattice<vobj>, do nothing otherwise (i.e. if fundamental or grid Tensor)
+    typedef typename getVectorType<T>::type vector_obj;
     typedef typename GridTypeMapper<vector_obj>::scalar_type scalar_type; //get the associated scalar type. Works on fundamental and tensor types
     typedef typename GridTypeMapper<scalar_type>::Realified real_scalar_type; //remove any std::complex wrapper, should get us to the fundamental type
@@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv)
     CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
   }

+  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
+    CartesianCommunicator::Hugepages = 1;
+  }
+
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
     Grid_debug_handler_init();
   }
|
|||||||
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
|
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
|
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
|
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
|
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
@ -317,7 +323,7 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
|
std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
|
std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
|
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
|
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
|
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
|
||||||
@ -356,10 +362,15 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
||||||
}
|
}
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
||||||
LebesgueOrder::UseLebesgueOrder=1;
|
LebesgueOrder::UseLebesgueOrder=1;
|
||||||
}
|
}
|
||||||
|
CartesianCommunicator::nCommThreads = -1;
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
|
||||||
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
|
||||||
|
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
|
||||||
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
||||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
||||||
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
|
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
|
||||||
@ -374,10 +385,13 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
Grid_default_latt,
|
Grid_default_latt,
|
||||||
Grid_default_mpi);
|
Grid_default_mpi);
|
||||||
|
|
||||||
std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
|
std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
|
||||||
|
if ( CartesianCommunicator::Hugepages) {
|
||||||
|
std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
||||||
std::cout<<GridLogMessage<<"Grid Decomposition\n";
|
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
||||||
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
|
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
|
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
|
||||||
@ -393,7 +407,7 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
|
|
||||||
void Grid_finalize(void)
|
void Grid_finalize(void)
|
||||||
{
|
{
|
||||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
|
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
Grid_unquiesce_nodes();
|
Grid_unquiesce_nodes();
|
||||||
#endif
|
#endif
|
||||||
|
tests/IO/Test_ildg_io.cc (new file, 101 lines)
@@ -0,0 +1,101 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_nersc_io.cc

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
  Grid_init(&argc,&argv);

  std::cout <<GridLogMessage<< " main "<<std::endl;

  std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
  std::vector<int> mpi_layout = GridDefaultMpi();
  //std::vector<int> latt_size ({48,48,48,96});
  //std::vector<int> latt_size ({32,32,32,32});
  std::vector<int> latt_size ({16,16,16,32});
  std::vector<int> clatt_size ({4,4,4,8});
  int orthodir=3;
  int orthosz =latt_size[orthodir];

  GridCartesian Fine(latt_size,simd_layout,mpi_layout);
  GridCartesian Coarse(clatt_size,simd_layout,mpi_layout);

  GridParallelRNG pRNGa(&Fine);
  GridParallelRNG pRNGb(&Fine);
  GridSerialRNG sRNGa;
  GridSerialRNG sRNGb;

  std::cout <<GridLogMessage<< " seeding... "<<std::endl;
  pRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
  sRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
  std::cout <<GridLogMessage<< " ...done "<<std::endl;

  LatticeGaugeField Umu(&Fine);
  LatticeGaugeField Umu_diff(&Fine);
  LatticeGaugeField Umu_saved(&Fine);

  std::vector<LatticeColourMatrix> U(4,&Fine);

  SU3::HotConfiguration(pRNGa,Umu);

  FieldMetaData header;

  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
  std::cout <<GridLogMessage<<"** Writing out ILDG conf *********"<<std::endl;
  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
  std::string file("./ckpoint_ildg.4000");
  IldgWriter _IldgWriter;
  _IldgWriter.open(file);
  _IldgWriter.writeConfiguration(Umu,4000,std::string("dummy_ildg_LFN"),std::string("dummy_config"));
  _IldgWriter.close();

  Umu_saved = Umu;
  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
  std::cout <<GridLogMessage<<"** Reading back ILDG conf *********"<<std::endl;
  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
  IldgReader _IldgReader;
  _IldgReader.open(file);
  _IldgReader.readConfiguration(Umu,header);
  _IldgReader.close();
  Umu_diff = Umu - Umu_saved;

  std::cout <<GridLogMessage<< "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;

  Grid_finalize();
#endif
}
tests/IO/Test_ildg_read.cc (new file, 117 lines)
@@ -0,0 +1,117 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_nersc_io.cc

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    (Same GPL-2 licence preamble as tests/IO/Test_ildg_io.cc above.)
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
  Grid_init(&argc,&argv);

  std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
  std::vector<int> mpi_layout = GridDefaultMpi();
  std::vector<int> latt_size = GridDefaultLatt();
  int orthodir=3;
  int orthosz =latt_size[orthodir];

  GridCartesian Fine(latt_size,simd_layout,mpi_layout);

  LatticeGaugeField Umu(&Fine);
  std::vector<LatticeColourMatrix> U(4,&Fine);

  FieldMetaData header;
  std::string file("./ildg.file");
  IldgReader IR;
  IR.open(file);
  IR.readConfiguration(Umu,header);
  IR.close();

  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }

  // Painful ; fix syntactical niceness
  LatticeComplex LinkTrace(&Fine);
  LinkTrace=zero;
  for(int mu=0;mu<Nd;mu++){
    LinkTrace = LinkTrace + trace(U[mu]);
  }

  // (1+2+3)=6 = N(N-1)/2 terms
  LatticeComplex Plaq(&Fine);

  Plaq = zero;

  for(int mu=1;mu<Nd;mu++){
    for(int nu=0;nu<mu;nu++){
      Plaq = Plaq + trace(U[mu]*Cshift(U[nu],mu,1)*adj(Cshift(U[mu],nu,1))*adj(U[nu]));
    }
  }

  double vol = Fine.gSites();
  Complex PlaqScale(1.0/vol/6.0/3.0);
  std::cout<<GridLogMessage <<"PlaqScale" << PlaqScale<<std::endl;

  std::vector<TComplex> Plaq_T(orthosz);
  sliceSum(Plaq,Plaq_T,Nd-1);
  int Nt = Plaq_T.size();

  TComplex Plaq_T_sum;
  Plaq_T_sum=zero;
  for(int t=0;t<Nt;t++){
    Plaq_T_sum = Plaq_T_sum+Plaq_T[t];
    Complex Pt=TensorRemove(Plaq_T[t]);
    std::cout<<GridLogMessage << "sliced ["<<t<<"]" <<Pt*PlaqScale*Real(Nt)<<std::endl;
  }

  {
    Complex Pt = TensorRemove(Plaq_T_sum);
    std::cout<<GridLogMessage << "total " <<Pt*PlaqScale<<std::endl;
  }

  TComplex Tp = sum(Plaq);
  Complex p = TensorRemove(Tp);
  std::cout<<GridLogMessage << "calculated plaquettes " <<p*PlaqScale<<std::endl;

  Complex LinkTraceScale(1.0/vol/4.0/3.0);
  TComplex Tl = sum(LinkTrace);
  Complex l = TensorRemove(Tl);
  std::cout<<GridLogMessage << "calculated link trace " <<l*LinkTraceScale<<std::endl;

  Grid_finalize();
#endif
}
@@ -38,10 +38,13 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
+
+  std::cout <<GridLogMessage<< " main "<<std::endl;
+
   std::vector<int> simd_layout = GridDefaultSimd(4,vComplex::Nsimd());
   std::vector<int> mpi_layout = GridDefaultMpi();
-  std::vector<int> latt_size ({16,16,16,16});
+  //std::vector<int> latt_size ({48,48,48,96});
+  //std::vector<int> latt_size ({32,32,32,32});
+  std::vector<int> latt_size ({16,16,16,32});
   std::vector<int> clatt_size ({4,4,4,8});
   int orthodir=3;
   int orthosz =latt_size[orthodir];
@@ -49,30 +52,32 @@ int main (int argc, char ** argv)
   GridCartesian Fine(latt_size,simd_layout,mpi_layout);
   GridCartesian Coarse(clatt_size,simd_layout,mpi_layout);


   GridParallelRNG pRNGa(&Fine);
   GridParallelRNG pRNGb(&Fine);
   GridSerialRNG sRNGa;
   GridSerialRNG sRNGb;

+  std::cout <<GridLogMessage<< " seeding... "<<std::endl;
   pRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
   sRNGa.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  std::cout <<GridLogMessage<< " ...done "<<std::endl;

   std::string rfile("./ckpoint_rng.4000");
+  FieldMetaData rngheader;
   NerscIO::writeRNGState(sRNGa,pRNGa,rfile);
-  NerscField rngheader;
   NerscIO::readRNGState (sRNGb,pRNGb,rngheader,rfile);

   LatticeComplex tmpa(&Fine); random(pRNGa,tmpa);
   LatticeComplex tmpb(&Fine); random(pRNGb,tmpb);
   tmpa = tmpa - tmpb;
-  std::cout << " difference between restored randoms and orig "<<norm2( tmpa ) <<" / "<< norm2(tmpb)<<std::endl;
+  std::cout <<GridLogMessage<< " difference between restored randoms and orig "<<norm2( tmpa ) <<" / "<< norm2(tmpb)<<std::endl;

   ComplexD a,b;

   random(sRNGa,a);
   random(sRNGb,b);
-  std::cout << " serial RNG numbers "<<a<<" "<<b<<std::endl;
+  std::cout <<GridLogMessage<< " serial RNG numbers "<<a<<" "<<b<<std::endl;


   LatticeGaugeField Umu(&Fine);
   LatticeGaugeField Umu_diff(&Fine);
@@ -80,15 +85,20 @@ int main (int argc, char ** argv)

   std::vector<LatticeColourMatrix> U(4,&Fine);

-  SU3::ColdConfiguration(pRNGa,Umu);
+  SU3::HotConfiguration(pRNGa,Umu);

-  NerscField header;
+  FieldMetaData header;
   std::string file("./ckpoint_lat.4000");

   int precision32 = 0;
   int tworow = 0;
   NerscIO::writeConfiguration(Umu,file,tworow,precision32);
+  Umu_saved = Umu;
   NerscIO::readConfiguration(Umu,header,file);
+  Umu_diff = Umu - Umu_saved;
+  //std::cout << "Umu_save "<<Umu_saved[0]<<std::endl;
+  //std::cout << "Umu_read "<<Umu[0]<<std::endl;
+  std::cout <<GridLogMessage<< "norm2 Gauge Diff = "<<norm2(Umu_diff)<<std::endl;

   for(int mu=0;mu<Nd;mu++){
     U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -115,7 +125,6 @@ int main (int argc, char ** argv)
 #endif
   double vol = Fine.gSites();
   Complex PlaqScale(1.0/vol/6.0/3.0);
-  std::cout<<GridLogMessage <<"PlaqScale" << PlaqScale<<std::endl;

   std::vector<TComplex> Plaq_T(orthosz);
   sliceSum(Plaq,Plaq_T,Nd-1);
@@ -139,7 +148,6 @@ int main (int argc, char ** argv)
   Complex p = TensorRemove(Tp);
   std::cout<<GridLogMessage << "calculated plaquettes " <<p*PlaqScale<<std::endl;
-

   Complex LinkTraceScale(1.0/vol/4.0/3.0);
   TComplex Tl = sum(LinkTrace);
   Complex l = TensorRemove(Tl);
@@ -50,7 +50,7 @@ int main (int argc, char ** argv)
   LatticeGaugeField Umu(&Fine);
   std::vector<LatticeColourMatrix> U(4,&Fine);

-  NerscField header;
+  FieldMetaData header;
   std::string file("./ckpoint_lat");
   NerscIO::readConfiguration(Umu,header,file);

Some files were not shown because too many files have changed in this diff.