Merge branch 'hotfix/dirac-ITT-fix'

Patching macos compile
Mac os happiness
2026-06-16 00:53:10 +01:00 · 2017-09-05 15:08:16 +01:00 · 2017-09-05 15:07:07 +01:00 · 2017-09-05 15:00:16 +01:00 · 2017-09-05 14:55:54 +01:00 · 2017-09-05 14:30:29 +01:00
137 changed files with 7013 additions and 4318 deletions
@@ -9,68 +9,6 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
    - compiler: gcc
      dist: trusty
      sudo: required
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.9
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-4.9
    - compiler: gcc
      dist: trusty
      sudo: required
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-5
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-5
    - compiler: clang
      dist: trusty
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
      dist: trusty
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 before_install:
    - export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
    - echo make clean
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
@@ -1,27 +1,44 @@
-# Grid
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
 <table>
 <tr>
    <td>Last stable release</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
    </td>
 </tr>
 <tr>
    <td>Development branch</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
    </td>
 </tr>
 </table>
 **Data parallel C++ mathematical object library.**
 License: GPL v2.
-Last update Nov 2016.
+Last update June 2017.
 _Please do not send pull requests to the `master` branch which is reserved for releases._
 ### Description
 This library provides data parallel C++ container classes with internal memory layout
 that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 are provided, similar to HPF and cmfortran, and user control is given over the mapping of
 array indices to both MPI tasks and SIMD processing elements.
 * Identically shaped arrays can then be processed with perfect data parallelisation.
 * Such identically shaped arrays are called conformable arrays.
 The transformation is based on the observation that Cartesian array processing involves
 identical processing to be performed on different regions of the Cartesian array.
 The library will both geometrically decompose into MPI tasks and across SIMD lanes.
 Local vector loops are parallelised with OpenMP pragmas.
 Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
 optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
 for most programmers.
 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
 Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
 These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. 
 The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 MPI, OpenMP, and SIMD parallelism are present in the library.
 Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
 ### Compilers
 Intel ICPC v16.0.3 and later
@@ -56,35 +73,25 @@ When you file an issue, please go though the following checklist:
 6. Attach the output of `make V=1`.
 7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
 ### Required libraries
 Grid requires:
 [GMP](https://gmplib.org/), 
-### Description
+[MPFR](http://www.mpfr.org/) 
 This library provides data parallel C++ container classes with internal memory layout
 that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 are provided, similar to HPF and cmfortran, and user control is given over the mapping of
 array indices to both MPI tasks and SIMD processing elements.
-* Identically shaped arrays then be processed with perfect data parallelisation.
+Bootstrapping Grid downloads the Eigen library, which is used for internal dense matrix (non-QCD) operations.
 * Such identically shaped arrays are called conformable arrays.
-The transformation is based on the observation that Cartesian array processing involves
+Grid optionally uses:
 identical processing to be performed on different regions of the Cartesian array.
-The library will both geometrically decompose into MPI tasks and across SIMD lanes.
+[HDF5](https://support.hdfgroup.org/HDF5/)  
 Local vector loops are parallelised with OpenMP pragmas.
-Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
+[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support. 
 optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
 for most programmers.
-The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
+[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
 Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
+LAPACK either generic version or Intel MKL library.
 The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 MPI, OpenMP, and SIMD parallelism are present in the library.
 Please see https://arxiv.org/abs/1512.03487 for more detail.
 ### Quick start
 First, start by cloning the repository:
@@ -155,7 +162,6 @@ The following options can be use with the `--enable-comms=` option to target dif
 | `none`         | no communications                                             |
 | `mpi[-auto]`   | MPI communications                                            |
 | `mpi3[-auto]`  | MPI communications using MPI 3 shared memory                  |
 | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem `       | Cray SHMEM communications                                     |
 For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.  
@@ -173,7 +179,8 @@ The following options can be use with the `--enable-simd=` option to target diff
 | `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | `AVX2`      | AVX 2 (256 bit)                        |
 | `AVX512`    | AVX 512 bit                            |
-| `QPX`       | QPX (256 bit)                          |
+| `NEONv8`    | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit)                     |
 | `QPX`       | IBM QPX (256 bit)                      |
 Alternatively, some CPU codenames can be directly used:
@@ -195,21 +202,205 @@ The following configuration is recommended for the Intel Knights Landing platfor
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=KNL        \
-             --enable-comms=mpi-auto \
+             --enable-comms=mpi-auto  \
             --with-gmp=<path>        \
             --with-mpfr=<path>       \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
 ```
 The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
-where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=KNL        \
             --enable-comms=mpi       \
             --with-gmp=<path>        \
             --with-mpfr=<path>       \
             --enable-mkl             \
             CXX=CC CC=cc
 ```
 If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 ``` bash
               --with-gmp=<path>        \
               --with-mpfr=<path>       \
 ```
 where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 Knight's Landing with Intel Omnipath adapters with two adapters per node 
 presently performs better with use of more than one rank per node, using shared memory 
 for interior communication. This is the mpi3 communications implementation. 
 We recommend four ranks per node for best performance, but optimum is local volume dependent.
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=KNL        \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CC=icpc MPICXX=mpiicpc 
 ```
 ### Build setup for Intel Haswell Xeon platform
 The following configuration is recommended for the Intel Haswell platform:
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=AVX2       \
             --enable-comms=mpi3-auto \
             --enable-mkl             \
             CXX=icpc MPICXX=mpiicpc
 ```
 The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 ``` bash
               --with-gmp=<path>        \
               --with-mpfr=<path>       \
 ```
 where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=AVX2       \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
 ```
 Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
 one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
 ```
        export I_MPI_PIN=1
 ```
 This is the default.
 ### Build setup for Intel Skylake Xeon platform
 The following configuration is recommended for the Intel Skylake platform:
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=mpiicpc
 ```
 The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 ``` bash
               --with-gmp=<path>        \
               --with-mpfr=<path>       \
 ```
 where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=AVX512     \
             --enable-comms=mpi3      \
             --enable-mkl             \
             CXX=CC CC=cc
 ```
 Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
 one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
 ``` 
        export I_MPI_PIN=1
 ```
 This is the default. 
 #### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping: 
 mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 
 TBA
 ### Build setup for AMD EPYC / RYZEN
 The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
 So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
 are common. Each chip within the module exposes a separate NUMA domain.
 There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
 MPI-3 is recommended with the use of four ranks per socket,
 and 8 threads per rank. 
 The following configuration is recommended for the AMD EPYC platform.
 ``` bash
 ../configure --enable-precision=double\
             --enable-simd=AVX2       \
             --enable-comms=mpi3 \
             CXX=mpicxx 
 ```
 If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 ``` bash
               --with-gmp=<path>        \
               --with-mpfr=<path>       \
 ```
 where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
 This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this. 
 It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and
 shared memory to communicate within this node:
 mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4 
 Where omp_bind.sh does the following:
 ```
 #!/bin/bash
 numanode=` expr $PMI_RANK % 8 `
 basecore=`expr $numanode \* 16`
 core0=`expr $basecore + 0 `
 core1=`expr $basecore + 2 `
 core2=`expr $basecore + 4 `
 core3=`expr $basecore + 6 `
 core4=`expr $basecore + 8 `
 core5=`expr $basecore + 10 `
 core6=`expr $basecore + 12 `
 core7=`expr $basecore + 14 `
 export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
 echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY
 $@
 ```
 Performance:
 #### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping: 
 mpirun  -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
 TBA
 ### Build setup for BlueGene/Q
 To be written...
 ### Build setup for ARM Neon
 To be written...
 ### Build setup for laptops, other compilers, non-cluster builds
 Many versions of g++ and clang++ work with Grid; using them merely involves replacing CXX (and MPICXX)
 and omitting the enable-mkl flag. 
 Single node builds are enabled with 
 ```
            --enable-comms=none
 ```
 FFTW support that is not in the default search path may then be enabled with
 ```
    --with-fftw=<installpath>
 ```
 BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
@@ -1,24 +1,32 @@
 TODO:
 ---------------
-Peter's work list:
+Large item work list:
 1)- Precision conversion and sort out localConvert      <-- 
 2)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
-- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
+1)- BG/Q port and check
-- Physical propagator interface
+2)- Christoph's local basis expansion Lanczos
-- Conserved currents
+3)- Precision conversion and sort out localConvert      <-- partial
-- GaugeFix into central location
+
-- Multigrid Wilson and DWF, compare to other Multigrid implementations
+  - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-- HDCR resume
+4)- Physical propagator interface
 5)- Conserved currents
 6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
 7)- HDCR resume
 Recent DONE 
 -- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O.  <--- DONE
 -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location                      <-- DONE
 -- Scidac and Ildg metadata handling                   <-- DONE
 -- Binary I/O MPI2 IO                                  <-- DONE
 -- Binary I/O speed up & x-strips                      <-- DONE
 -- Cut down the exterior overhead                      <-- DONE
 -- Interior legs from SHM comms                        <-- DONE
 -- Half-precision comms                                <-- DONE
-- Merge high precision reduction into develop        
+-- Merge high precision reduction into develop         <-- DONE
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+-- BlockCG, BCGrQ                                      <-- DONE
 -- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE
   -- slice* linalg routines for multiRHS, BlockCG    
 -----
@@ -0,0 +1,797 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_memory_bandwidth.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 // Shorthand names for WilsonFermion5D instantiated with the
 // DomainWallVec5dImpl{R,F,D} implementation policies.
 using WilsonFermion5DR = WilsonFermion5D<DomainWallVec5dImplR>;
 using WilsonFermion5DF = WilsonFermion5D<DomainWallVec5dImplF>;
 using WilsonFermion5DD = WilsonFermion5D<DomainWallVec5dImplD>;

 // File-scope result tables shared between the benchmark routines.
 std::vector<int>    L_list;
 std::vector<int>    Ls_list;
 std::vector<double> mflop_list;

 // Reference Mflop/s figure and its error; zero until a benchmark fills them in.
 double mflop_ref     = 0.0;
 double mflop_ref_err = 0.0;

 // Node count; written by Benchmark::DWF5 from the grid's NodeCount().
 int NN_global = 0;
 /// Summary statistics of a set of timing samples.
 struct time_statistics{
  double mean;  ///< arithmetic mean of the samples
  double err;   ///< standard error of the mean: sqrt( sum (x-mean)^2 / (N*(N-1)) )
  double min;   ///< smallest sample
  double max;   ///< largest sample

  /// Recompute mean, err, min and max from the samples in v.
  ///
  /// @param v  samples; taken by const reference so the caller's vector is not copied.
  ///
  /// Edge cases:
  ///  - empty input: all four fields are set to 0 (previously this dereferenced
  ///    the end iterator returned by std::minmax_element and divided 0 by 0);
  ///  - a single sample: err is set to 0 (previously 0/0 produced NaN because
  ///    the N*(N-1) denominator vanishes).
  void statistics(const std::vector<double> &v){
      const std::size_t count = v.size();
      if ( count == 0 ) {
        mean = 0.0;
        err  = 0.0;
        min  = 0.0;
        max  = 0.0;
        return;
      }

      const double sum = std::accumulate(v.begin(), v.end(), 0.0);
      const double avg = sum / count;
      mean = avg;

      if ( count > 1 ) {
        // Sum of squared deviations, accumulated in sample order.
        double sq_sum = 0.0;
        for (double x : v) {
          const double d = x - avg;
          sq_sum += d * d;
        }
        err = std::sqrt(sq_sum / (count * (count - 1)));
      } else {
        err = 0.0;  // spread of one sample is undefined; report zero rather than NaN
      }

      const auto extremes = std::minmax_element(v.begin(), v.end());
      min = *extremes.first;
      max = *extremes.second;
 }
 };
 // Print the column-heading row for the halo-exchange bandwidth table.
 void comms_header(){
  // Keep writing to whatever stream the logger insertion hands back.
  auto &&out = std::cout << GridLogMessage;
  out << " L  " << "\t" << " Ls  " << "\t";
  out << std::setw(11) << "bytes";
  out << "MB/s uni (err/min/max)" << "\t\t" << "MB/s bidi (err/min/max)";
  out << std::endl;
 }
 // Gamma-matrix identifiers listed in X, Y, Z, T order.
 Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
                           Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT };
 // One benchmark configuration. Instances are brace-initialised positionally,
 // so the member order below is part of the interface.
 struct controls {
  // Kernel variant; copied into QCD::WilsonKernelsStatic::Opt before a run.
  int Opt;
  // Comms/compute ordering; copied into QCD::WilsonKernelsStatic::Comms.
  int CommsOverlap;
  // Policy handed to CartesianCommunicator::SetCommunicatorPolicy().
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
  //  int HugePages;
 };
 class Benchmark {
 public:
  // Log the thread count followed by the default MPI layout and, for each
  // SIMD vector type, its width in bits and default four-dimensional SIMD layout.
  static void Decomposition (void ) {
    const char *rule = "==================================================================================";
    const int nthreads = GridThread::GetThreads();

    // One table row: label, vector width in bits, default SIMD decomposition.
    auto simd_row = [](const char *label, std::size_t bits, int nsimd) {
      std::cout<<GridLogMessage<<label<<bits<<"bits ; "
               <<GridCmdVectorIntToString(GridDefaultSimd(4,nsimd))<<std::endl;
    };

    std::cout<<GridLogMessage << rule <<std::endl;
    std::cout<<GridLogMessage << "= Grid is setup to use "<<nthreads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << rule <<std::endl;

    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;

    simd_row("\tvReal          : ", sizeof(vReal    )*8, vReal::Nsimd());
    simd_row("\tvRealF         : ", sizeof(vRealF   )*8, vRealF::Nsimd());
    simd_row("\tvRealD         : ", sizeof(vRealD   )*8, vRealD::Nsimd());
    simd_row("\tvComplex       : ", sizeof(vComplex )*8, vComplex::Nsimd());
    simd_row("\tvComplexF      : ", sizeof(vComplexF)*8, vComplexF::Nsimd());
    simd_row("\tvComplexD      : ", sizeof(vComplexD)*8, vComplexD::Nsimd());

    std::cout<<GridLogMessage << rule <<std::endl;
  }
  // Halo-exchange bandwidth benchmark.
  //
  // For local lattice sizes lat = 4,8,...,maxlat (with Ls fixed at 8) this
  // allocates eight send and eight receive buffers, issues
  // StencilSendToRecvFrom() in every direction whose MPI extent exceeds one,
  // repeats that Nloop times, and prints one table row with transfer-rate
  // statistics derived from the per-iteration wall times.
  static void Comms(void)
  {
    int Nloop=200;
    int nmu=0;      // number of dimensions that are actually split across ranks
    int maxlat=32;
    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();
    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
    std::vector<double> t_time(Nloop);  // wall time of each iteration, in microseconds
    time_statistics timestat;
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    comms_header();
    for(int lat=4;lat<=maxlat;lat+=4){
      // Single pass: Ls starts at 8 and the bound is also 8.
      for(int Ls=8;Ls<=8;Ls*=2){
 	// Global lattice = local extent times the rank count in each dimension.
 	std::vector<int> latt_size  ({lat*mpi_layout[0],
 	      lat*mpi_layout[1],
 	      lat*mpi_layout[2],
 	      lat*mpi_layout[3]});
 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 	RealD Nrank = Grid._Nprocessors;
 	RealD Nnode = Grid.NodeCount();
 	RealD ppn = Nrank/Nnode;  // ranks per node
 	std::vector<HalfSpinColourVectorD *> xbuf(8);
 	std::vector<HalfSpinColourVectorD *> rbuf(8);
 	// Release buffers handed out for the previous lattice size before allocating anew.
 	Grid.ShmBufferFreeAll();
 	for(int d=0;d<8;d++){
 	  xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	  rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}
 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);  // size of one buffer
 	// NOTE(review): ncomm is incremented below but never read afterwards.
 	int ncomm;
 	double dbytes;
 	// NOTE(review): `times` is never used; the timings are stored in t_time.
 	std::vector<double> times(Nloop);
 	for(int i=0;i<Nloop;i++){
 	  double start=usecond();
 	  // Reset every iteration, so after the loop dbytes holds the total of
 	  // the final iteration only.
 	  dbytes=0;
 	  ncomm=0;
 	  parallel_for(int dir=0;dir<8;dir++){
 	    double tbytes;
 	    int mu =dir % 4;
 	    if (mpi_layout[mu]>1 ) {
 	      int xmit_to_rank;
 	      int recv_from_rank;
 	      // dir 0..3 shifts by +1 in dimension mu; dir 4..7 shifts by
 	      // mpi_layout[mu]-1 (presumably the opposite neighbour on a
 	      // periodic rank grid -- TODO confirm).
 	      if ( dir == mu ) { 
 		int comm_proc=1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      } else { 
 		int comm_proc = mpi_layout[mu]-1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      }
 	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 						 (void *)&rbuf[dir][0], recv_from_rank,
 						 bytes,dir);
 	      // The shared counters are updated atomically when OpenMP is enabled.
 #ifdef GRID_OMP
 #pragma omp atomic
 #endif
 	      ncomm++;
 #ifdef GRID_OMP
 #pragma omp atomic
 #endif
 	      dbytes+=tbytes;
 	    }
 	  }
 	  Grid.Barrier();
 	  double stop=usecond();
 	  t_time[i] = stop-start; // microseconds
 	}
 	timestat.statistics(t_time);
 	//	for(int i=0;i<t_time.size();i++){
 	//	  std::cout << i<<" "<<t_time[i]<<std::endl;
 	//	}
 	// Scale the per-rank byte count by ranks per node.
 	dbytes=dbytes*ppn;
 	// Half of the total is reported as the unidirectional figure (presumably
 	// StencilSendToRecvFrom returns sent plus received bytes -- TODO confirm).
 	double xbytes    = dbytes*0.5;
 	double rbytes    = dbytes*0.5;
 	double bidibytes = dbytes;
 	// bytes / microseconds is what the header labels MB/s. Columns: rate at
 	// the mean time, its error (bytes*err/mean^2), then the rates at the
 	// slowest and fastest iteration.
 	std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
 		 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
 		 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
 		 <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
 		 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 	    }
    }    
    return;
  }
  // Streaming memory-bandwidth benchmark.
  //
  // For local lattice sizes lat = 8,12,...,lmax this repeatedly evaluates
  // z = a*x - y on lattices of Nvec-component vReal vectors and prints, per
  // size: bytes touched per sweep, bytes/time, flops/time, total elapsed
  // seconds, and bytes/time divided by the node count.
  //
  // Accounting: 2 flops (multiply, add) and 3 words moved (read x, read y,
  // write z) per element. `time` is in nanoseconds per sweep (usecond()
  // difference per iteration times 1000), so bytes/time is GB/s.
  static void Memory(void)
  {
    const int Nvec=8;
    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
    typedef iVector<vReal,Nvec> Vec;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

    uint64_t NP;
    uint64_t NN;
    uint64_t lmax=48;
    // Iteration count shrinks with the fourth power of lat: 100 sweeps at
    // lat == lmax, proportionally more for smaller volumes.
 #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

    for(int lat=8;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      // Widen BEFORE multiplying: the product of four int extents overflows
      // 32-bit int for large global lattices (signed overflow is undefined).
      int64_t vol= static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      NP= Grid.RankCount();
      NN =Grid.NodeCount();

      // Fill all three fields with the same random site vector.
      Vec rn ; random(sRNG,rn);

      LatticeVec z(&Grid); z=rn;
      LatticeVec x(&Grid); x=rn;
      LatticeVec y(&Grid); y=rn;
      double a=2.0;

      uint64_t Nloop=NLOOP;

      double start=usecond();
      for(uint64_t i=0;i<Nloop;i++){
	z=a*x-y;
        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double flops=vol*Nvec*2;// mul,add
      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<<GridLogMessage<<std::setprecision(3) 
	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
	       << "\t\t"<< bytes/time/NN <<std::endl;

    }
  };
  static double DWF5(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}), 
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;
    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);
    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    ///////// Source preparation ////////////
    LatticeFermion src   (sFGrid); random(RNG5,src);
    LatticeFermion tmp   (sFGrid);
    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;
    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
    LatticeFermion src_e (sFrbGrid);
    LatticeFermion src_o (sFrbGrid);
    LatticeFermion r_e   (sFrbGrid);
    LatticeFermion r_o   (sFrbGrid);
    LatticeFermion r_eo  (sFGrid);
    LatticeFermion err   (sFGrid);
    {
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
 #if defined(AVX512) 
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
 #endif
      controls Cases [] = {
 #ifdef AVX512
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      }; 
      for(int c=0;c<num_cases;c++) {
 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
 	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	int nwarm = 100;
 	uint64_t ncall = 1000;
 	double t0=usecond();
 	sFGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
 	  sDw.DhopEO(src_o,r_e,DaggerNo);
 	}
 	sFGrid->Barrier();
 	double t1=usecond();
 	sDw.ZeroCounters();
 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
 	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
 	  sDw.DhopEO(src_o,r_e,DaggerNo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	sFGrid->Barrier();
 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 	double flops=(1344.0*volume)/2;
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
 	mf_hi = flops/timestat.min;
 	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
 	sDw.Report();
      }
      double robust = mflops_worst/mflops_best;;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage;
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    }
    return mflops_best;
  }
  // Benchmark Dw.DhopEO for a DomainWallFermionR operator over a table of
  // kernel / comms-overlap cases, and check the operator against a naive reference.
  //
  // Ls     : fifth-dimension extent handed to makeFiveDimGrid / makeFiveDimRedBlackGrid.
  // L      : per-dimension extent of the 4D "local" lattice; the global lattice is
  //          L * nodes[mu] in each direction (see latt4 below).
  // robust : output parameter, set to mflops_worst/mflops_best over all cases.
  //
  // Returns the largest mean mflop/s figure observed over the cases.
  //
  // Side effects visible in this block: assigns NN_global, overwrites
  // WilsonKernelsStatic::Comms / ::Opt and the communicator policy (not restored),
  // logs to std::cout, and asserts on an unsupported ranks-per-node count or when
  // the computed result differs from the reference.
  static double DWF(int Ls,int L, double & robust)
  {
    RealD mass=0.1;
    RealD M5  =1.8;
    double mflops;
    // 0 doubles as the "not yet set" marker for best/worst (see the update logic in the case loop).
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});
    // TmpGrid is used in this block only to query RankCount() and NodeCount().
    // NOTE(review): the 64^4 extent looks like a placeholder, and none of the grids
    // allocated in this function are deleted here -- confirm ownership.
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}), 
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    // NN_global is declared outside this block; it is overwritten on every call.
    NN_global=NN;
    // Ranks per node. Only 1, 2, 4 or 8 are handled; anything else hits assert(0).
    uint64_t SHM=NP/NN;
    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);
    // nodes[mu] = MPI ranks in direction mu divided by the intranode factor;
    // the global lattice is the local extent scaled by nodes[mu].
    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
    ///////// RNG Init ////////////
    // Fixed seeds are used for both generators.
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    ///////// Source preparation ////////////
    LatticeFermion src   (FGrid); random(RNG5,src);
    LatticeFermion ref   (FGrid);
    LatticeFermion tmp   (FGrid);
    // Rescale the random source by 1/sqrt(norm2(src)).
    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;
    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
    ////////////////////////////////////
    // Naive wilson implementation
    ////////////////////////////////////
    // Builds the reference field `ref` from Cshift and Gamma(Gmu[mu]) (Gmu is declared
    // outside this block); `ref` is compared against the operator output in the case loop.
    {
      LatticeGaugeField Umu5d(FGrid); 
      std::vector<LatticeColourMatrix> U(4,FGrid);
      // Copy each 4D gauge site into Ls consecutive slots of the 5D field.
      for(int ss=0;ss<Umu._grid->oSites();ss++){
 	for(int s=0;s<Ls;s++){
 	  Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
 	}
      }
      ref = zero;
      for(int mu=0;mu<Nd;mu++){
 	U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
      }
      // Shifts use dimension mu+1 of the 5D fields.
      for(int mu=0;mu<Nd;mu++){
 	tmp = U[mu]*Cshift(src,mu+1,1);
 	ref=ref + tmp - Gamma(Gmu[mu])*tmp;
 	tmp =adj(U[mu])*src;
 	tmp =Cshift(tmp,mu+1,-1);
 	ref=ref + tmp + Gamma(Gmu[mu])*tmp;
      }
      ref = -0.5*ref;
    }
    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);
    LatticeFermion r_eo  (FGrid);
    LatticeFermion err   (FGrid);
    {
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
      // num_cases, fmt and the rows of Cases[] must stay in step: the AVX512 build
      // prepends two OptInlineAsm rows. The fmt labels follow the row order; they
      // appear to abbreviate Asm/Unrolled/Generic and Sequential/Overlapped.
 #if defined(AVX512) 
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
 #endif
      controls Cases [] = {
 #ifdef AVX512
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      }; 
      for(int c=0;c<num_cases;c++) {
 	// Select the kernel variant, comms-overlap mode and communicator policy for this case.
 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
 	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	// Warm-up: nwarm untimed calls bracketed by barriers. The t0/t1 values taken here
 	// feed only the commented-out ncall estimate below.
 	int nwarm = 200;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
 	  Dw.DhopEO(src_o,r_e,DaggerNo);
 	}
 	FGrid->Barrier();
 	double t1=usecond();
 	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
 	//	if (ncall < 500) ncall = 500;
 	uint64_t ncall = 1000;
 	// NOTE(review): presumably broadcasts rank 0's ncall so every rank runs the same
 	// number of calls; with the fixed value above it is already identical -- confirm.
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	Dw.ZeroCounters();
 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
 	// Each call is timed individually; there is no barrier inside the loop.
 	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
 	  Dw.DhopEO(src_o,r_e,DaggerNo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
 	// volume = Ls times the product of the global 4D extents.
 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 	// NOTE(review): 1344 is presumably the flop count per site and the /2 presumably
 	// reflects that DhopEO writes one checkerboard -- confirm. With usecond()
 	// presumably in microseconds, flops/time is the mflop/s figure logged below.
 	double flops=(1344.0*volume)/2;
 	double mf_hi, mf_lo, mf_err;
 	// statistics() is defined outside this block; min, max, mean and err are read from it afterwards.
 	timestat.statistics(t_time);
 	mf_hi = flops/timestat.min;
 	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	// First case seeds both extremes (0 == unset); later cases widen them.
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
 	Dw.Report();
 	// Correctness check for this case: compute both checkerboards, merge them into
 	// r_eo and require agreement with the naive reference.
 	Dw.DhopEO(src_o,r_e,DaggerNo);
 	Dw.DhopOE(src_e,r_o,DaggerNo);
 	setCheckerboard(r_eo,r_o);
 	setCheckerboard(r_eo,r_e);
 	err = r_eo-ref; 
 	std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 	assert((norm2(err)<1.0e-4));
      }
      // Ratio of slowest to fastest case, reported to the caller through `robust`.
      robust = mflops_worst/mflops_best;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust  <<std::endl;
      // One per-node figure per case, in the same order as the labels in fmt.
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    }
    return mflops_best;
  }
 };
 // Benchmark driver.
 //
 // Initialises Grid, then (subject to the do_* switches below) runs
 //   - Benchmark::DWF with Ls=1           -> "Wilson" column,
 //   - Benchmark::DWF with Ls=16          -> "DWF4" column plus robustness figures,
 //   - Benchmark::DWF5 with Ls=16         -> "DWF5" column,
 //   - Benchmark::Memory and, when more than one node is in use, Benchmark::Comms,
 // over every local extent in L_list, and prints total and per-node summary tables.
 //
 // Result vectors are pre-sized to L_list.size() and filled by index, so the
 // summary tables stay in bounds (printing 0) when an individual benchmark is
 // switched off.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
 #ifdef KNL
  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
 #else
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
 #endif
  Benchmark::Decomposition();

  // Compile-time style switches selecting which benchmarks run.
  int do_memory=1;
  int do_comms =1;
  int do_su3   =0;
  int do_wilson=1;
  int do_dwf   =1;

  if ( do_su3 ) {
    // empty for now
  }

  // sel indexes L_list: it picks the local volume reported as the "comparison point".
  int sel=2;
  std::vector<int> L_list({8,12,16,24});
  //int sel=1;
  //  std::vector<int> L_list({8,12});
  const int nL = static_cast<int>(L_list.size());
  // Guard the comparison-point lookups at the end of main.
  assert( (sel>=0) && (sel<nL) );

  // One slot per entry of L_list; zero means "benchmark not run".
  std::vector<double> robust_list(nL,0.0);
  std::vector<double> wilson(nL,0.0);
  std::vector<double> dwf4(nL,0.0);
  std::vector<double> dwf5(nL,0.0);

  if ( do_wilson ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<nL;l++){
      double robust; // required by the DWF signature; not reported for the Ls=1 run
      wilson[l] = Benchmark::DWF(1,L_list[l],robust);
    }
  }

  int Ls=16;
  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<nL;l++){
      double robust;
      dwf4[l]        = Benchmark::DWF(Ls,L_list[l],robust);
      robust_list[l] = robust;
    }
  }

  if ( do_dwf ) {
    // This loop fills the DWF5 column, so the banner must not repeat the 4D one.
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 5D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<nL;l++){
      dwf5[l] = Benchmark::DWF5(Ls,L_list[l]);
    }
  }

  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
    for(int l=0;l<nL;l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }

  // NN_global is assigned inside Benchmark::DWF; read it only after the DWF runs above.
  int NN=NN_global;

  if ( do_memory ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Memory();
  }

  // The comms benchmark is skipped when only a single node is in use.
  if ( do_comms && (NN>1) ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Comms();
  }

  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl;
    for(int l=0;l<nL;l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Comparison point     result: "  << dwf4[sel]/NN << " Mflop/s per node"<<std::endl;
    std::cout<<std::setprecision(3);
    std::cout<<GridLogMessage << " Comparison point robustness: "  << robust_list[sel] <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }

  Grid_finalize();
 }
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
  int Nloop=100;
  int nmu=0;
-  int maxlat=24;
+  int maxlat=32;
  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int mu=0;mu<8;mu++){
 	xbuf[mu].resize(lat*lat*lat*Ls);
 	rbuf[mu].resize(lat*lat*lat*Ls);
 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
      }
      for(int i=0;i<Nloop;i++){
      double start=usecond();
@@ -112,7 +117,6 @@ int main (int argc, char ** argv)
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.SendToRecvFromBegin(requests,
 				   (void *)&xbuf[mu][0],
@@ -163,7 +167,7 @@ int main (int argc, char ** argv)
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat,lat,lat,lat});
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
      for(int mu=0;mu<8;mu++){
 	xbuf[mu].resize(lat*lat*lat*Ls);
 	rbuf[mu].resize(lat*lat*lat*Ls);
 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
      }
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -249,7 +258,7 @@ int main (int argc, char ** argv)
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
@@ -299,7 +308,7 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu);
 	    comm_proc = mpi_layout[mu]-1;
@@ -310,11 +319,11 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu+4);
 	  }
 	}
-	Grid.StencilSendToRecvFromComplete(requests);
+	Grid.StencilSendToRecvFromComplete(requests,0);
 	Grid.Barrier();
 	double stop=usecond();
 	t_time[i] = stop-start; // microseconds
@@ -346,7 +355,7 @@ int main (int argc, char ** argv)
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
@@ -393,8 +402,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu);
-	    Grid.StencilSendToRecvFromComplete(requests);
+	    Grid.StencilSendToRecvFromComplete(requests,mu);
 	    requests.resize(0);
 	    comm_proc = mpi_layout[mu]-1;
@@ -406,8 +415,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu+4);
-	    Grid.StencilSendToRecvFromComplete(requests);
+	    Grid.StencilSendToRecvFromComplete(requests,mu+4);
 	    requests.resize(0);
 	  }
@@ -436,5 +445,97 @@ int main (int argc, char ** argv)
    }
  }    
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
      				    lat*mpi_layout[2],
      				    lat*mpi_layout[3]});
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      RealD Nrank = Grid._Nprocessors;
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      for(int d=0;d<8;d++){
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
      }
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      double dbytes;
      for(int i=0;i<Nloop;i++){
 	double start=usecond();
 	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 	dbytes=0;
 	ncomm=0;
 	parallel_for(int dir=0;dir<8;dir++){
 	  double tbytes;
 	  int mu =dir % 4;
 	  if (mpi_layout[mu]>1 ) {
 	    ncomm++;
 	    int xmit_to_rank;
 	    int recv_from_rank;
 	    if ( dir == mu ) { 
 	      int comm_proc=1;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    } else { 
 	      int comm_proc = mpi_layout[mu]-1;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    }
 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
 #pragma omp atomic
 	    dbytes+=tbytes;
 	  }
 	}
 	Grid.Barrier();
 	double stop=usecond();
 	t_time[i] = stop-start; // microseconds
      }
      timestat.statistics(t_time);
      dbytes=dbytes*ppn;
      double xbytes    = dbytes*0.5;
      double rbytes    = dbytes*0.5;
      double bidibytes = dbytes;
      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
    }
  }    
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  Grid_finalize();
 }
@@ -165,7 +165,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =1000;
+  int ncall =500;
  if (1) {
    FGrid->Barrier();
    Dw.ZeroCounters();
@@ -303,6 +303,7 @@ int main (int argc, char ** argv)
    }
    assert(sum < 1.0e-4);
    if(1){
      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
      std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
@@ -381,8 +382,23 @@ int main (int argc, char ** argv)
      }
      assert(error<1.0e-4);
    }
  if(0){
    std::cout << "Single cache warm call to sDw.Dhop " <<std::endl;
    for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
      sDw.Dhop(ssrc,sresult,0);
      PerformanceCounter Counter(i);
      Counter.Start();
      sDw.Dhop(ssrc,sresult,0);
      Counter.Stop();
      Counter.Report();
    }
  }
  }
  if (1)
  { // Naive wilson dag implementation
    ref = zero;
@@ -487,9 +503,9 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
-  //assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_e)<1.0e-4);
-  //assert(norm2(src_o)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
  Grid_finalize();
  exit(0);
 }
@@ -55,21 +55,21 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  uint64_t lmax=64;
+  uint64_t lmax=96;
-#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
+#define NLOOP (10*lmax*lmax*lmax*lmax/vol)
-  for(int lat=4;lat<=lmax;lat+=4){
+  for(int lat=8;lat<=lmax;lat+=8){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      uint64_t Nloop=NLOOP;
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeVec z(&Grid); //random(pRNG,z);
+      LatticeVec z(&Grid);// random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
+      LatticeVec x(&Grid);// random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      LatticeVec y(&Grid);// random(pRNG,y);
      double a=2.0;
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
      double time = (stop-start)/Nloop*1000;
      double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
    }
@@ -94,17 +94,17 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  for(int lat=4;lat<=lmax;lat+=4){
+  for(int lat=8;lat<=lmax;lat+=8){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      // Widen the first factor so the whole product is evaluated in 64 bits;
+      // multiplying four ints first could overflow before the assignment.
+      int64_t vol= static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeVec z(&Grid); //random(pRNG,z);
+      LatticeVec z(&Grid);// random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
+      LatticeVec x(&Grid);// random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      LatticeVec y(&Grid);// random(pRNG,y);
      double a=2.0;
      uint64_t Nloop=NLOOP;
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
      double time = (stop-start)/Nloop*1000;
      double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
    }
@@ -129,20 +129,20 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
-  for(int lat=4;lat<=lmax;lat+=4){
+  for(int lat=8;lat<=lmax;lat+=8){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      // Widen the first factor so the whole product is evaluated in 64 bits;
+      // multiplying four ints first could overflow before the assignment.
+      int64_t vol= static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      uint64_t Nloop=NLOOP;
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeVec z(&Grid); //random(pRNG,z);
+      LatticeVec z(&Grid);// random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
+      LatticeVec x(&Grid);// random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      LatticeVec y(&Grid);// random(pRNG,y);
      RealD a=2.0;
@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
-      double bytes=2*vol*Nvec*sizeof(Real);
+      double bytes=2.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*1;// mul
      std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
@@ -166,17 +166,17 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  for(int lat=4;lat<=lmax;lat+=4){
+  for(int lat=8;lat<=lmax;lat+=8){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      // Widen the first factor so the whole product is evaluated in 64 bits;
+      // multiplying four ints first could overflow before the assignment.
+      int64_t vol= static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      uint64_t Nloop=NLOOP;
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeVec z(&Grid); //random(pRNG,z);
+      LatticeVec z(&Grid);// random(pRNG,z);
-      LatticeVec x(&Grid); //random(pRNG,x);
+      LatticeVec x(&Grid);// random(pRNG,x);
-      LatticeVec y(&Grid); //random(pRNG,y);
+      LatticeVec y(&Grid);// random(pRNG,y);
      RealD a=2.0;
      Real nn;      
      double start=usecond();
@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
-      double bytes=vol*Nvec*sizeof(Real);
+      double bytes=1.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
@@ -37,12 +37,12 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);
 #define LMAX (64)
-  int Nloop=20;
+  int64_t Nloop=20;
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
-  int threads = GridThread::GetThreads();
+  int64_t threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -54,16 +54,16 @@ int main (int argc, char ** argv)
  for(int lat=2;lat<=LMAX;lat+=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      // Widen the first factor so the whole product is evaluated in 64 bits;
+      // multiplying four ints first could overflow before the assignment.
+      int64_t vol = static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeColourMatrix z(&Grid);// random(pRNG,z);
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
-      LatticeColourMatrix x(&Grid);// random(pRNG,x);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
-      LatticeColourMatrix y(&Grid);// random(pRNG,y);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
      double start=usecond();
-      for(int i=0;i<Nloop;i++){
+      for(int64_t i=0;i<Nloop;i++){
 	x=x*y;
      }
      double stop=usecond();
@@ -86,17 +86,17 @@ int main (int argc, char ** argv)
  for(int lat=2;lat<=LMAX;lat+=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      // Widen the first factor so the whole product is evaluated in 64 bits;
+      // multiplying four ints first could overflow before the assignment.
+      int64_t vol = static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeColourMatrix z(&Grid); //random(pRNG,z);
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
-      LatticeColourMatrix x(&Grid); //random(pRNG,x);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
-      LatticeColourMatrix y(&Grid); //random(pRNG,y);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
      double start=usecond();
-      for(int i=0;i<Nloop;i++){
+      for(int64_t i=0;i<Nloop;i++){
 	z=x*y;
      }
      double stop=usecond();
@@ -117,17 +117,17 @@ int main (int argc, char ** argv)
  for(int lat=2;lat<=LMAX;lat+=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      // Widen the first factor so the whole product is evaluated in 64 bits;
+      // multiplying four ints first could overflow before the assignment.
+      int64_t vol = static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeColourMatrix z(&Grid); //random(pRNG,z);
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
-      LatticeColourMatrix x(&Grid); //random(pRNG,x);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
-      LatticeColourMatrix y(&Grid); //random(pRNG,y);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
      double start=usecond();
-      for(int i=0;i<Nloop;i++){
+      for(int64_t i=0;i<Nloop;i++){
 	mult(z,x,y);
      }
      double stop=usecond();
@@ -148,17 +148,17 @@ int main (int argc, char ** argv)
  for(int lat=2;lat<=LMAX;lat+=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+      // Widen the first factor so the whole product is evaluated in 64 bits;
+      // multiplying four ints first could overflow before the assignment.
+      int64_t vol = static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-      LatticeColourMatrix z(&Grid); //random(pRNG,z);
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
-      LatticeColourMatrix x(&Grid); //random(pRNG,x);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
-      LatticeColourMatrix y(&Grid); //random(pRNG,y);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
      double start=usecond();
-      for(int i=0;i<Nloop;i++){
+      for(int64_t i=0;i<Nloop;i++){
 	mac(z,x,y);
      }
      double stop=usecond();
@@ -1,4 +1,4 @@
-]#!/usr/bin/env bash
+#!/usr/bin/env bash
 EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2'
@@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 ################ Get git info
 #AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
 ################ Set flags
 # do not move!
 CXXFLAGS="-O3 $CXXFLAGS"
 ############### Checks for programs
 AC_PROG_CXX
 AC_PROG_RANLIB
@@ -27,7 +31,6 @@ AX_GXX_VERSION
 AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
      [version of g++ that will compile the code])
 CXXFLAGS="-g $CXXFLAGS"
 ############### Checks for typedefs, structures, and compiler characteristics
@@ -51,9 +54,14 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
 AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
 ############## Standard libraries
 AC_CHECK_LIB([m],[cos])
 AC_CHECK_LIB([stdc++],[abort])
 ############### GMP and MPFR
 AC_ARG_WITH([gmp],
    [AS_HELP_STRING([--with-gmp=prefix],
@@ -186,9 +194,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
 AC_SEARCH_LIBS([crc32], [z],
               [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
-               [have_zlib=true],
+               [have_zlib=true] [LIBS="${LIBS} -lz"],
 	       [AC_MSG_ERROR(zlib library was not found in your system.)])
 AC_SEARCH_LIBS([move_pages], [numa],
               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
 	       [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
               [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
               [have_hdf5=true]
@@ -241,6 +254,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        SIMD_FLAGS='';;
      KNL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        AC_DEFINE([KNL],[1],[Knights landing processor])
        SIMD_FLAGS='-march=knl';;
      GEN)
        AC_DEFINE([GEN],[1],[generic vector code])
@@ -248,6 +262,9 @@ case ${ax_cv_cxx_compiler_vendor} in
                           [generic SIMD vector width (in bytes)])
        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
        SIMD_FLAGS='';;
      NEONv8)
        AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
        SIMD_FLAGS='-march=armv8-a';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
        SIMD_FLAGS='';;
@@ -276,6 +293,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        SIMD_FLAGS='';;
      KNL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
        AC_DEFINE([KNL],[1],[Knights landing processor])
        SIMD_FLAGS='-xmic-avx512';;
      GEN)
        AC_DEFINE([GEN],[1],[generic vector code])
@@ -313,8 +331,41 @@ case ${ac_PRECISION} in
     double)
       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
     ;;
     *)
     AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
     ;;
 esac
 ######################  Shared memory allocation technique under MPI3
 # The shm option picks which preprocessor symbol is defined: shmget, shmopen
 # (the default when the option is not given) or hugetlbfs. Any other value
 # stops configure with an error.
 AC_ARG_ENABLE([shm],[AS_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs],
              [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
 case ${ac_SHM} in
     shmget)
     AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
     ;;
     shmopen)
     AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
     ;;
     hugetlbfs)
     AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
     ;;
     *)
     AC_MSG_ERROR([${ac_SHM} unsupported --enable-shm option]);
     ;;
 esac
 ######################  Shared base path for SHMMMAP
 # The shmpath option sets the string baked into GRID_SHM_PATH; when it is not
 # given the path falls back to /var/lib/hugetlbfs/pagesize-2MB/.
 AC_ARG_ENABLE([shmpath],[AS_HELP_STRING([--enable-shmpath=path],
              [Select SHM mmap base path for hugetlbfs])],
 	      [ac_SHMPATH=${enable_shmpath}],
 	      [ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
 AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
 ############### communication type selection
 AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
              [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
@@ -324,14 +375,14 @@ case ${ac_COMMS} in
        AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
        comms_type='none'
     ;;
     mpi3l*)
       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
       comms_type='mpi3l'
     ;;
     mpi3*)
        AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
        comms_type='mpi3'
     ;;
     mpit)
        AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
        comms_type='mpit'
     ;;
     mpi*)
        AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
        comms_type='mpi'
@@ -359,7 +410,7 @@ esac
 # String equality uses '=': POSIX test does not define '==', and configure
 # runs under /bin/sh, which need not be bash.
 AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" = "shmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" = "mpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" = "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+# String equality uses '=': POSIX test does not define '=='.
+AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" = "mpitX" ] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" = "noneX" ])
 ############### RNG selection
@@ -464,6 +515,8 @@ compiler version            : ${ax_cv_gxx_version}
 SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading                   : ${ac_openmp}
 Communications type         : ${comms_type}
 Shared memory allocator     : ${ac_SHM}
 Shared memory mmap path     : ${ac_SHMPATH}
 Default precision           : ${ac_PRECISION}
 Software FP16 conversion    : ${ac_SFW_FP16}
 RNG choice                  : ${ac_RNG}
@@ -41,9 +41,10 @@ using namespace Hadrons;
 // constructor /////////////////////////////////////////////////////////////////
 Environment::Environment(void)
 {
-    nd_ = GridDefaultLatt().size();
+    dim_ = GridDefaultLatt();
    nd_  = dim_.size();
    grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()),
+        dim_, GridDefaultSimd(nd_, vComplex::Nsimd()),
        GridDefaultMpi()));
    gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
    auto loc = getGrid()->LocalDimensions();
@@ -132,6 +133,16 @@ unsigned int Environment::getNd(void) const
    return nd_;
 }
 // Return a copy of dim_, the vector the constructor fills from
 // GridDefaultLatt().
 std::vector<int> Environment::getDim(void) const
 {
    return dim_;
 }
 // Return entry 'mu' of dim_. The access is not range-checked, so the caller
 // must pass mu < dim_.size().
 int Environment::getDim(const unsigned int mu) const
 {
    return dim_[mu];
 }
 // random number generator /////////////////////////////////////////////////////
 void Environment::setSeed(const std::vector<int> &seed)
 {
@@ -271,6 +282,21 @@ std::string Environment::getModuleType(const std::string name) const
    return getModuleType(getModuleAddress(name));
 }
 // Extract the innermost namespace from the type name of the module at
 // 'address', i.e. the text between the last two "::" separators
 // ("A::B::C" -> "B"). If the name holds a single separator, everything before
 // it is returned; if it holds none, the result is an empty string.
 // NOTE(review): separators inside template argument lists are not treated
 // specially -- confirm against what getModuleType() actually returns.
 std::string Environment::getModuleNamespace(const unsigned int address) const
 {
    const std::string type = getModuleType(address);
    const auto        last = type.rfind("::");
    if (last == std::string::npos)
    {
        return std::string();
    }
    // only search for an earlier separator when there is room for one, so
    // that 'last - 2' can never wrap around
    const auto prev  = (last >= 2) ? type.rfind("::", last - 2)
                                   : std::string::npos;
    const auto begin = (prev == std::string::npos) ? std::string::size_type(0)
                                                   : prev + 2;
    return type.substr(begin, last - begin);
 }
 // Name-based overload: looks the module address up with getModuleAddress()
 // and forwards to the address-based getModuleNamespace().
 std::string Environment::getModuleNamespace(const std::string name) const
 {
    return getModuleNamespace(getModuleAddress(name));
 }
 bool Environment::hasModule(const unsigned int address) const
 {
    return (address < module_.size());
@@ -492,7 +518,14 @@ std::string Environment::getObjectType(const unsigned int address) const
 {
    if (hasRegisteredObject(address))
    {
-        return typeName(object_[address].type);
+        if (object_[address].type)
        {
            return typeName(object_[address].type);
        }
        else
        {
            return "<no type>";
        }
    }
    else if (hasObject(address))
    {
@@ -532,6 +565,23 @@ Environment::Size Environment::getObjectSize(const std::string name) const
    return getObjectSize(getObjectAddress(name));
 }
 // Return the 'module' field stored for the object at 'address'.
 // HADRON_ERROR is invoked when no object exists at that address.
 unsigned int Environment::getObjectModule(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        HADRON_ERROR("no object with address " + std::to_string(address));
    }
    else
    {
        return object_[address].module;
    }
 }
 // Name-based overload: resolves the object address with getObjectAddress()
 // and forwards to the address-based getObjectModule().
 unsigned int Environment::getObjectModule(const std::string name) const
 {
    return getObjectModule(getObjectAddress(name));
 }
 unsigned int Environment::getObjectLs(const unsigned int address) const
 {
    if (hasRegisteredObject(address))
@@ -106,6 +106,8 @@ public:
    void                    createGrid(const unsigned int Ls);
    GridCartesian *         getGrid(const unsigned int Ls = 1) const;
    GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
    std::vector<int>        getDim(void) const;
    int                     getDim(const unsigned int mu) const;
    unsigned int            getNd(void) const;
    // random number generator
    void                    setSeed(const std::vector<int> &seed);
@@ -131,6 +133,8 @@ public:
    std::string             getModuleName(const unsigned int address) const;
    std::string             getModuleType(const unsigned int address) const;
    std::string             getModuleType(const std::string name) const;
    std::string             getModuleNamespace(const unsigned int address) const;
    std::string             getModuleNamespace(const std::string name) const;
    bool                    hasModule(const unsigned int address) const;
    bool                    hasModule(const std::string name) const;
    Graph<unsigned int>     makeModuleGraph(void) const;
@@ -171,6 +175,8 @@ public:
    std::string             getObjectType(const std::string name) const;
    Size                    getObjectSize(const unsigned int address) const;
    Size                    getObjectSize(const std::string name) const;
    unsigned int            getObjectModule(const unsigned int address) const;
    unsigned int            getObjectModule(const std::string name) const;
    unsigned int            getObjectLs(const unsigned int address) const;
    unsigned int            getObjectLs(const std::string name) const;
    bool                    hasObject(const unsigned int address) const;
@@ -181,6 +187,10 @@ public:
    bool                    hasCreatedObject(const std::string name) const;
    bool                    isObject5d(const unsigned int address) const;
    bool                    isObject5d(const std::string name) const;
    template <typename T>
    bool                    isObjectOfType(const unsigned int address) const;
    template <typename T>
    bool                    isObjectOfType(const std::string name) const;
    Environment::Size       getTotalSize(void) const;
    void                    addOwnership(const unsigned int owner,
                                         const unsigned int property);
@@ -197,6 +207,7 @@ private:
    bool                                   dryRun_{false};
    unsigned int                           traj_, locVol_;
    // grids
    std::vector<int>                       dim_;
    GridPt                                 grid4d_;
    std::map<unsigned int, GridPt>         grid5d_;
    GridRbPt                               gridRb4d_;
@@ -343,7 +354,7 @@ T * Environment::getObject(const unsigned int address) const
        else
        {
            HADRON_ERROR("object with address " + std::to_string(address) +
-                         " does not have type '" + typeid(T).name() +
+                         " does not have type '" + typeName(&typeid(T)) +
                         "' (has type '" + getObjectType(address) + "')");
        }
    }
@@ -380,6 +391,37 @@ T * Environment::createLattice(const std::string name)
    return createLattice<T>(getObjectAddress(name));
 }
 template <typename T>
 bool Environment::isObjectOfType(const unsigned int address) const
 {
    if (hasRegisteredObject(address))
    {
        if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
        {
            return true;
        }
        else
        {
            return false;
        }
    }
    else if (hasObject(address))
    {
        HADRON_ERROR("object with address " + std::to_string(address) +
                     " exists but is not registered");
    }
    else
    {
        HADRON_ERROR("no object with address " + std::to_string(address));
    }
 }
 // Name-based overload: resolves the object address with getObjectAddress()
 // and forwards to the address-based isObjectOfType<T>().
 template <typename T>
 bool Environment::isObjectOfType(const std::string name) const
 {
    return isObjectOfType<T>(getObjectAddress(name));
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Environment_hpp_
@@ -51,23 +51,43 @@ using Grid::operator<<;
 * error with GCC 5 (clang & GCC 6 compile fine without it).
 */
 // FIXME: find a way to do that in a more general fashion
 #ifndef FIMPL
 #define FIMPL WilsonImplR
 #endif
 #ifndef SIMPL
 #define SIMPL ScalarImplCR
 #endif
 BEGIN_HADRONS_NAMESPACE
 // type aliases
-#define TYPE_ALIASES(FImpl, suffix)\
+#define FERM_TYPE_ALIASES(FImpl, suffix)\
 typedef FermionOperator<FImpl>                       FMat##suffix;             \
 typedef typename FImpl::FermionField                 FermionField##suffix;     \
 typedef typename FImpl::PropagatorField              PropagatorField##suffix;  \
 typedef typename FImpl::SitePropagator               SitePropagator##suffix;   \
-typedef typename FImpl::DoubledGaugeField            DoubledGaugeField##suffix;\
+typedef std::vector<typename FImpl::SitePropagator::scalar_object>             \
-typedef std::function<void(FermionField##suffix &,                             \
+                                                     SlicedPropagator##suffix;
 // Declares DoubledGaugeField<suffix> as an alias of FImpl::DoubledGaugeField.
 // Uses 'typename', so FImpl is expected to be a dependent (template) type.
 #define GAUGE_TYPE_ALIASES(FImpl, suffix)\
 typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
 // Declares both ScalarField<suffix> and PropagatorField<suffix> as aliases of
 // SImpl::Field.
 // NOTE(review): FERM_TYPE_ALIASES also declares PropagatorField<suffix>, so
 // the two macros presumably must not share a suffix in one scope -- confirm.
 #define SCALAR_TYPE_ALIASES(SImpl, suffix)\
 typedef typename SImpl::Field ScalarField##suffix;\
 typedef typename SImpl::Field PropagatorField##suffix;
 // Declares SolverFn<suffix>: a std::function taking a mutable
 // FermionField<suffix> reference and a const FermionField<suffix> reference.
 // FermionField<suffix> must already be declared (FERM_TYPE_ALIASES does so);
 // the FImpl argument is not used in the expansion.
 #define SOLVER_TYPE_ALIASES(FImpl, suffix)\
 typedef std::function<void(FermionField##suffix &,\
                       const FermionField##suffix &)> SolverFn##suffix;
 // Declares SinkFn<suffix>: a std::function mapping a const
 // PropagatorField<suffix> reference to a SlicedPropagator<suffix>. Both types
 // must already be declared for the same suffix (FERM_TYPE_ALIASES does so).
 #define SINK_TYPE_ALIASES(suffix)\
 typedef std::function<SlicedPropagator##suffix(const PropagatorField##suffix &)> SinkFn##suffix;
 // Convenience bundle: expands the fermion, gauge and solver alias macros, in
 // that order, for the same implementation and suffix.
 #define FGS_TYPE_ALIASES(FImpl, suffix)\
 FERM_TYPE_ALIASES(FImpl, suffix)\
 GAUGE_TYPE_ALIASES(FImpl, suffix)\
 SOLVER_TYPE_ALIASES(FImpl, suffix)
 // logger
 class HadronsLogger: public Logger
 {
@@ -1,31 +1,3 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: extras/Hadrons/Modules.hpp
 Copyright (C) 2015
 Copyright (C) 2016
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Hadrons/Modules/MAction/DWF.hpp>
 #include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
 #include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
@@ -36,13 +8,18 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
 #include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
 #include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Load.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Random.hpp>
 #include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
 #include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
 #include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
 #include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
 #include <Grid/Hadrons/Modules/MSink/Point.hpp>
 #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Grid/Hadrons/Modules/MSource/Point.hpp>
 #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
 #include <Grid/Hadrons/Modules/MSource/Wall.hpp>
 #include <Grid/Hadrons/Modules/MSource/Z2.hpp>
 #include <Grid/Hadrons/Modules/Quark.hpp>
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_DWF_hpp_
+#ifndef Hadrons_MAction_DWF_hpp_
-#define Hadrons_DWF_hpp_
+#define Hadrons_MAction_DWF_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -48,14 +48,15 @@ public:
                                    std::string, gauge,
                                    unsigned int, Ls,
                                    double      , mass,
-                                    double      , M5);
+                                    double      , M5,
                                    std::string , boundary);
 };
 template <typename FImpl>
 class TDWF: public Module<DWFPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TDWF(const std::string name);
@@ -116,14 +117,19 @@ void TDWF<FImpl>::execute(void)
                 << par().mass << ", M5= " << par().M5 << " and Ls= "
                 << par().Ls << " using gauge field '" << par().gauge << "'"
                 << std::endl;
    LOG(Message) << "Fermion boundary conditions: " << par().boundary 
                 << std::endl;
    env().createGrid(par().Ls);
    auto &U      = *env().template getObject<LatticeGaugeField>(par().gauge);
    auto &g4     = *env().getGrid();
    auto &grb4   = *env().getRbGrid();
    auto &g5     = *env().getGrid(par().Ls);
    auto &grb5   = *env().getRbGrid(par().Ls);
    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
    FMat *fMatPt = new DomainWallFermion<FImpl>(U, g5, grb5, g4, grb4,
-                                                par().mass, par().M5);
+                                                par().mass, par().M5,
                                                implParams);
    env().setObject(getName(), fMatPt);
 }
@@ -131,4 +137,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_DWF_hpp_
+#endif // Hadrons_MAction_DWF_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_Wilson_hpp_
+#ifndef Hadrons_MAction_Wilson_hpp_
-#define Hadrons_Wilson_hpp_
+#define Hadrons_MAction_Wilson_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -46,14 +46,15 @@ class WilsonPar: Serializable
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
                                    std::string, gauge,
-                                    double     , mass);
+                                    double     , mass,
                                    std::string, boundary);
 };
 template <typename FImpl>
 class TWilson: public Module<WilsonPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TWilson(const std::string name);
@@ -112,10 +113,15 @@ void TWilson<FImpl>::execute()
 {
    LOG(Message) << "Setting up TWilson fermion matrix with m= " << par().mass
                 << " using gauge field '" << par().gauge << "'" << std::endl;
    LOG(Message) << "Fermion boundary conditions: " << par().boundary 
                 << std::endl;
    auto &U      = *env().template getObject<LatticeGaugeField>(par().gauge);
    auto &grid   = *env().getGrid();
    auto &gridRb = *env().getRbGrid();
-    FMat *fMatPt = new WilsonFermion<FImpl>(U, grid, gridRb, par().mass);
+    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
    typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
    FMat *fMatPt = new WilsonFermion<FImpl>(U, grid, gridRb, par().mass,
                                            implParams);
    env().setObject(getName(), fMatPt);
 }
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_Baryon_hpp_
+#ifndef Hadrons_MContraction_Baryon_hpp_
-#define Hadrons_Baryon_hpp_
+#define Hadrons_MContraction_Baryon_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -55,9 +55,9 @@ template <typename FImpl1, typename FImpl2, typename FImpl3>
 class TBaryon: public Module<BaryonPar>
 {
 public:
-    TYPE_ALIASES(FImpl1, 1);
+    FERM_TYPE_ALIASES(FImpl1, 1);
-    TYPE_ALIASES(FImpl2, 2);
+    FERM_TYPE_ALIASES(FImpl2, 2);
-    TYPE_ALIASES(FImpl3, 3);
+    FERM_TYPE_ALIASES(FImpl3, 3);
    class Result: Serializable
    {
    public:
@@ -121,11 +121,11 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
    // FIXME: do contractions
-    write(writer, "meson", result);
+    // write(writer, "meson", result);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_Baryon_hpp_
+#endif // Hadrons_MContraction_Baryon_hpp_
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_DiscLoop_hpp_
+#ifndef Hadrons_MContraction_DiscLoop_hpp_
-#define Hadrons_DiscLoop_hpp_
+#define Hadrons_MContraction_DiscLoop_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -52,7 +52,7 @@ public:
 template <typename FImpl>
 class TDiscLoop: public Module<DiscLoopPar>
 {
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
    class Result: Serializable
    {
    public:
@@ -141,4 +141,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_DiscLoop_hpp_
+#endif // Hadrons_MContraction_DiscLoop_hpp_
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_Gamma3pt_hpp_
+#ifndef Hadrons_MContraction_Gamma3pt_hpp_
-#define Hadrons_Gamma3pt_hpp_
+#define Hadrons_MContraction_Gamma3pt_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -72,9 +72,9 @@ public:
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 class TGamma3pt: public Module<Gamma3ptPar>
 {
-    TYPE_ALIASES(FImpl1, 1);
+    FERM_TYPE_ALIASES(FImpl1, 1);
-    TYPE_ALIASES(FImpl2, 2);
+    FERM_TYPE_ALIASES(FImpl2, 2);
-    TYPE_ALIASES(FImpl3, 3);
+    FERM_TYPE_ALIASES(FImpl3, 3);
    class Result: Serializable
    {
    public:
@@ -167,4 +167,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_Gamma3pt_hpp_
+#endif // Hadrons_MContraction_Gamma3pt_hpp_
@@ -29,8 +29,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_Meson_hpp_
+#ifndef Hadrons_MContraction_Meson_hpp_
-#define Hadrons_Meson_hpp_
+#define Hadrons_MContraction_Meson_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -69,7 +69,7 @@ public:
                                    std::string, q1,
                                    std::string, q2,
                                    std::string, gammas,
-                                    std::string, mom,
+                                    std::string, sink,
                                    std::string, output);
 };
@@ -77,8 +77,10 @@ template <typename FImpl1, typename FImpl2>
 class TMeson: public Module<MesonPar>
 {
 public:
-    TYPE_ALIASES(FImpl1, 1);
+    FERM_TYPE_ALIASES(FImpl1, 1);
-    TYPE_ALIASES(FImpl2, 2);
+    FERM_TYPE_ALIASES(FImpl2, 2);
    FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
    SINK_TYPE_ALIASES(Scalar);
    class Result: Serializable
    {
    public:
@@ -115,7 +117,7 @@ TMeson<FImpl1, FImpl2>::TMeson(const std::string name)
 template <typename FImpl1, typename FImpl2>
 std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void)
 {
-    std::vector<std::string> input = {par().q1, par().q2};
+    std::vector<std::string> input = {par().q1, par().q2, par().sink};
    return input;
 }
@@ -131,12 +133,11 @@ std::vector<std::string> TMeson<FImpl1, FImpl2>::getOutput(void)
 template <typename FImpl1, typename FImpl2>
 void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)
 {
    gammaList.clear();
    // Determine gamma matrices to insert at source/sink.
    if (par().gammas.compare("all") == 0)
    {
        // Do all contractions.
        unsigned int n_gam = Ns * Ns;
        gammaList.resize(n_gam*n_gam);
        for (unsigned int i = 1; i < Gamma::nGamma; i += 2)
        {
            for (unsigned int j = 1; j < Gamma::nGamma; j += 2)
@@ -155,6 +156,9 @@ void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)
 // execution ///////////////////////////////////////////////////////////////////
 #define mesonConnected(q1, q2, gSnk, gSrc) \
 (g5*(gSnk))*(q1)*(adj(gSrc)*g5)*adj(q2)
 template <typename FImpl1, typename FImpl2>
 void TMeson<FImpl1, FImpl2>::execute(void)
 {
@@ -162,43 +166,72 @@ void TMeson<FImpl1, FImpl2>::execute(void)
                 << " quarks '" << par().q1 << "' and '" << par().q2 << "'"
                 << std::endl;
-    CorrWriter              writer(par().output);
+    CorrWriter             writer(par().output);
    PropagatorField1       &q1 = *env().template getObject<PropagatorField1>(par().q1);
    PropagatorField2       &q2 = *env().template getObject<PropagatorField2>(par().q2);
    LatticeComplex         c(env().getGrid());
    Gamma                  g5(Gamma::Algebra::Gamma5);
    std::vector<GammaPair> gammaList;
    std::vector<TComplex>  buf;
    std::vector<Result>    result;
-    std::vector<Real>      p;
+    Gamma                  g5(Gamma::Algebra::Gamma5);
-
+    std::vector<GammaPair> gammaList;
-    p  = strToVec<Real>(par().mom);
+    int                    nt = env().getDim(Tp);
    LatticeComplex         ph(env().getGrid()), coor(env().getGrid());
    Complex                i(0.0,1.0);
    ph = zero;
    for(unsigned int mu = 0; mu < env().getNd(); mu++)
    {
        LatticeCoordinate(coor, mu);
        ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
    }
    ph = exp((Real)(2*M_PI)*i*ph);
    parseGammaString(gammaList);
    result.resize(gammaList.size());
    for (unsigned int i = 0; i < result.size(); ++i)
    {
        Gamma gSnk(gammaList[i].first);
        Gamma gSrc(gammaList[i].second);
        c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph;
        sliceSum(c, buf, Tp);
        result[i].gamma_snk = gammaList[i].first;
        result[i].gamma_src = gammaList[i].second;
-        result[i].corr.resize(buf.size());
+        result[i].corr.resize(nt);
-        for (unsigned int t = 0; t < buf.size(); ++t)
+    }
    if (env().template isObjectOfType<SlicedPropagator1>(par().q1) and
        env().template isObjectOfType<SlicedPropagator2>(par().q2))
    {
        SlicedPropagator1 &q1 = *env().template getObject<SlicedPropagator1>(par().q1);
        SlicedPropagator2 &q2 = *env().template getObject<SlicedPropagator2>(par().q2);
        LOG(Message) << "(propagator already sinked)" << std::endl;
        for (unsigned int i = 0; i < result.size(); ++i)
        {
-            result[i].corr[t] = TensorRemove(buf[t]);
+            Gamma gSnk(gammaList[i].first);
            Gamma gSrc(gammaList[i].second);
            for (unsigned int t = 0; t < buf.size(); ++t)
            {
                result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
            }
        }
    }
    else
    {
        PropagatorField1 &q1   = *env().template getObject<PropagatorField1>(par().q1);
        PropagatorField2 &q2   = *env().template getObject<PropagatorField2>(par().q2);
        LatticeComplex   c(env().getGrid());
        LOG(Message) << "(using sink '" << par().sink << "')" << std::endl;
        for (unsigned int i = 0; i < result.size(); ++i)
        {
            Gamma       gSnk(gammaList[i].first);
            Gamma       gSrc(gammaList[i].second);
            std::string ns;
            ns = env().getModuleNamespace(env().getObjectModule(par().sink));
            if (ns == "MSource")
            {
                PropagatorField1 &sink =
                    *env().template getObject<PropagatorField1>(par().sink);
                c = trace(mesonConnected(q1, q2, gSnk, gSrc)*sink);
                sliceSum(c, buf, Tp);
            }
            else if (ns == "MSink")
            {
                SinkFnScalar &sink = *env().template getObject<SinkFnScalar>(par().sink);
                c   = trace(mesonConnected(q1, q2, gSnk, gSrc));
                buf = sink(c);
            }
            for (unsigned int t = 0; t < buf.size(); ++t)
            {
                result[i].corr[t] = TensorRemove(buf[t]);
            }
        }
    }
    write(writer, "meson", result);
@@ -208,4 +241,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_Meson_hpp_
+#endif // Hadrons_MContraction_Meson_hpp_
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_WeakHamiltonian_hpp_
+#ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
-#define Hadrons_WeakHamiltonian_hpp_
+#define Hadrons_MContraction_WeakHamiltonian_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -83,7 +83,7 @@ public:
 class T##modname: public Module<WeakHamiltonianPar>\
 {\
 public:\
-    TYPE_ALIASES(FIMPL,)\
+    FERM_TYPE_ALIASES(FIMPL,)\
    class Result: Serializable\
    {\
    public:\
@@ -111,4 +111,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_WeakHamiltonian_hpp_
+#endif // Hadrons_MContraction_WeakHamiltonian_hpp_
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_WeakHamiltonianEye_hpp_
+#ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_
-#define Hadrons_WeakHamiltonianEye_hpp_
+#define Hadrons_MContraction_WeakHamiltonianEye_hpp_
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
@@ -55,4 +55,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_WeakHamiltonianEye_hpp_
+#endif // Hadrons_MContraction_WeakHamiltonianEye_hpp_
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_WeakHamiltonianNonEye_hpp_
+#ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
-#define Hadrons_WeakHamiltonianNonEye_hpp_
+#define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
@@ -54,4 +54,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_WeakHamiltonianNonEye_hpp_
+#endif // Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_WeakNeutral4ptDisc_hpp_
+#ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
-#define Hadrons_WeakNeutral4ptDisc_hpp_
+#define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
 #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
@@ -56,4 +56,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_WeakNeutral4ptDisc_hpp_
+#endif // Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
@@ -1,34 +1,5 @@
-/*************************************************************************************
+#ifndef Hadrons_MFermion_GaugeProp_hpp_
-
+#define Hadrons_MFermion_GaugeProp_hpp_
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: extras/Hadrons/Modules/Quark.hpp
 Copyright (C) 2015
 Copyright (C) 2016
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Quark_hpp_
 #define Hadrons_Quark_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -37,27 +8,29 @@ See the full license in the file "LICENSE" in the top level distribution directo
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
- *                               TQuark                                       *
+ *                                GaugeProp                                   *
 ******************************************************************************/
-class QuarkPar: Serializable
+BEGIN_MODULE_NAMESPACE(MFermion)
 class GaugePropPar: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar,
+    GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar,
                                    std::string, source,
                                    std::string, solver);
 };
 template <typename FImpl>
-class TQuark: public Module<QuarkPar>
+class TGaugeProp: public Module<GaugePropPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
    // constructor
-    TQuark(const std::string name);
+    TGaugeProp(const std::string name);
    // destructor
-    virtual ~TQuark(void) = default;
+    virtual ~TGaugeProp(void) = default;
-    // dependencies/products
+    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
@@ -69,20 +42,20 @@ private:
    SolverFn     *solver_{nullptr};
 };
-MODULE_REGISTER(Quark, TQuark<FIMPL>);
+MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);
 /******************************************************************************
- *                          TQuark implementation                             *
+ *                      TGaugeProp implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
-TQuark<FImpl>::TQuark(const std::string name)
+TGaugeProp<FImpl>::TGaugeProp(const std::string name)
-: Module(name)
+: Module<GaugePropPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl>
-std::vector<std::string> TQuark<FImpl>::getInput(void)
+std::vector<std::string> TGaugeProp<FImpl>::getInput(void)
 {
    std::vector<std::string> in = {par().source, par().solver};
@@ -90,7 +63,7 @@ std::vector<std::string> TQuark<FImpl>::getInput(void)
 }
 template <typename FImpl>
-std::vector<std::string> TQuark<FImpl>::getOutput(void)
+std::vector<std::string> TGaugeProp<FImpl>::getOutput(void)
 {
    std::vector<std::string> out = {getName(), getName() + "_5d"};
@@ -99,7 +72,7 @@ std::vector<std::string> TQuark<FImpl>::getOutput(void)
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TQuark<FImpl>::setup(void)
+void TGaugeProp<FImpl>::setup(void)
 {
    Ls_ = env().getObjectLs(par().solver);
    env().template registerLattice<PropagatorField>(getName());
@@ -111,13 +84,13 @@ void TQuark<FImpl>::setup(void)
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TQuark<FImpl>::execute(void)
+void TGaugeProp<FImpl>::execute(void)
 {
    LOG(Message) << "Computing quark propagator '" << getName() << "'"
-                 << std::endl;
+    << std::endl;
    FermionField    source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)),
-                    tmp(env().getGrid());
+    tmp(env().getGrid());
    std::string     propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
    PropagatorField &prop    = *env().template createLattice<PropagatorField>(propName);
    PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);
@@ -128,7 +101,7 @@ void TQuark<FImpl>::execute(void)
    }
    LOG(Message) << "Inverting using solver '" << par().solver
-                 << "' on source '" << par().source << "'" << std::endl;
+    << "' on source '" << par().source << "'" << std::endl;
    for (unsigned int s = 0; s < Ns; ++s)
    for (unsigned int c = 0; c < Nc; ++c)
    {
@@ -170,7 +143,7 @@ void TQuark<FImpl>::execute(void)
        if (Ls_ > 1)
        {
            PropagatorField &p4d =
-                *env().template getObject<PropagatorField>(getName());
+            *env().template getObject<PropagatorField>(getName());
            axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
            axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
@@ -180,6 +153,8 @@ void TQuark<FImpl>::execute(void)
    }
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_Quark_hpp_
+#endif // Hadrons_MFermion_GaugeProp_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_Load_hpp_
+#ifndef Hadrons_MGauge_Load_hpp_
-#define Hadrons_Load_hpp_
+#define Hadrons_MGauge_Load_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -70,4 +70,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_Load_hpp_
+#endif // Hadrons_MGauge_Load_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_Random_hpp_
+#ifndef Hadrons_MGauge_Random_hpp_
-#define Hadrons_Random_hpp_
+#define Hadrons_MGauge_Random_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_Random_hpp_
+#endif // Hadrons_MGauge_Random_hpp_
@@ -0,0 +1,88 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: extras/Hadrons/Modules/MGauge/StochEm.cc
 Copyright (C) 2015
 Copyright (C) 2016
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;
 /******************************************************************************
 *                  TStochEm implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<StochEmPar> base; no other state.
 TStochEm::TStochEm(const std::string name)
 : Module<StochEmPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // This module lists no input objects: the returned list is always empty.
 std::vector<std::string> TStochEm::getInput(void)
 {
    return std::vector<std::string>();
 }
 // The only declared product is the object named after the module itself.
 std::vector<std::string> TStochEm::getOutput(void)
 {
    std::vector<std::string> products;

    products.push_back(getName());

    return products;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Registers the lattices used by execute(): the weight lattice
 // "_<name>_weight" (only if not registered yet) and the EM field itself.
 void TStochEm::setup(void)
 {
    const std::string weightName = "_" + getName() + "_weight";

    if (!env().hasRegisteredObject(weightName))
    {
        env().registerLattice<EmComp>(weightName);
    }
    env().registerLattice<EmField>(getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Generates the stochastic EM potential into the lattice named after the
 // module. The weight lattice "_<name>_weight" is created and filled on the
 // first call and fetched from the environment afterwards.
 void TStochEm::execute(void)
 {
    PhotonR photon(par().gauge, par().zmScheme);
    EmField &a = *env().createLattice<EmField>(getName());
    // Assigned in both branches below; initialised so it is never indeterminate.
    EmComp  *w = nullptr;
    if (!env().hasCreatedObject("_" + getName() + "_weight"))
    {
        LOG(Message) << "Caching stochastic EM potential weight (gauge: "
                     << par().gauge << ", zero-mode scheme: "
                     << par().zmScheme << ")..." << std::endl;
        w = env().createLattice<EmComp>("_" + getName() + "_weight");
        photon.StochasticWeight(*w);
    }
    else
    {
        w = env().getObject<EmComp>("_" + getName() + "_weight");
    }
    LOG(Message) << "Generating stochastic EM potential..." << std::endl;
    photon.StochasticField(a, *env().get4dRng(), *w);
 }
@@ -0,0 +1,75 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp
 Copyright (C) 2015
 Copyright (C) 2016
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MGauge_StochEm_hpp_
 #define Hadrons_MGauge_StochEm_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
 #include <Grid/Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         StochEm                                 *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)
 // Serializable parameter set of the StochEm module.
 //   gauge    : PhotonR::Gauge value passed to the PhotonR constructor
 //   zmScheme : PhotonR::ZmScheme value passed to the PhotonR constructor
 class StochEmPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
                                    PhotonR::Gauge,    gauge,
                                    PhotonR::ZmScheme, zmScheme);
 };
 // Module producing a stochastic EM field, configured by StochEmPar.
 class TStochEm: public Module<StochEmPar>
 {
 public:
    // field types taken from PhotonR
    using EmField = PhotonR::GaugeField;
    using EmComp  = PhotonR::GaugeLinkField;
 public:
    // construction / destruction
    TStochEm(const std::string name);
    virtual ~TStochEm(void) = default;
    // names of the objects this module consumes and produces
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // lattice registration
    virtual void setup(void);
    // field generation
    virtual void execute(void);
 };
 MODULE_REGISTER_NS(StochEm, TStochEm, MGauge);
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MGauge_StochEm_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_Unit_hpp_
+#ifndef Hadrons_MGauge_Unit_hpp_
-#define Hadrons_Unit_hpp_
+#define Hadrons_MGauge_Unit_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_Unit_hpp_
+#endif // Hadrons_MGauge_Unit_hpp_
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_NoiseLoop_hpp_
+#ifndef Hadrons_MLoop_NoiseLoop_hpp_
-#define Hadrons_NoiseLoop_hpp_
+#define Hadrons_MLoop_NoiseLoop_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -65,7 +65,7 @@ template <typename FImpl>
 class TNoiseLoop: public Module<NoiseLoopPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TNoiseLoop(const std::string name);
@@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_NoiseLoop_hpp_
+#endif // Hadrons_MLoop_NoiseLoop_hpp_
@@ -0,0 +1,226 @@
 #include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MScalar;
 /******************************************************************************
 *                     TChargedProp implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<ChargedPropPar> base.
 TChargedProp::TChargedProp(const std::string name)
 : Module<ChargedPropPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Inputs, in order: the source object, then the EM field object.
 std::vector<std::string> TChargedProp::getInput(void)
 {
    std::vector<std::string> deps;

    deps.push_back(par().source);
    deps.push_back(par().emField);

    return deps;
 }
 // The only declared product is the object named after the module itself.
 std::vector<std::string> TChargedProp::getOutput(void)
 {
    std::vector<std::string> products;

    products.push_back(getName());

    return products;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Builds the names of the cached lattices (free momentum-space propagator,
 // one shift phase per direction, G*F*src) and registers each of them unless
 // it is already registered, then registers the output propagator.
 void TChargedProp::setup(void)
 {
    const auto nd = env().getNd();

    // object names
    freeMomPropName_ = FREEMOMPROP(par().mass);
    GFSrcName_       = "_" + getName() + "_DinvSrc";
    phaseName_.clear();
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
    }

    // registrations
    if (!env().hasRegisteredObject(freeMomPropName_))
    {
        env().registerLattice<ScalarField>(freeMomPropName_);
    }
    // the first phase name stands in for the whole set of phases
    if (!env().hasRegisteredObject(phaseName_.front()))
    {
        for (const auto &phName : phaseName_)
        {
            env().registerLattice<ScalarField>(phName);
        }
    }
    if (!env().hasRegisteredObject(GFSrcName_))
    {
        env().registerLattice<ScalarField>(GFSrcName_);
    }
    env().registerLattice<ScalarField>(getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Computes the charged scalar propagator up to second order in the charge q:
 //   prop = Finv*( G*F*src - q*G*momD1*G*F*src
 //                 + q^2*G*momD1*G*momD1*G*F*src - q^2*G*momD2*G*F*src )
 // where F is the forward FFT and G the free momentum-space propagator.
 // The analytic pieces (G, G*F*src, shift phases) are created on first use
 // and fetched from the environment afterwards. If par().output is non-empty,
 // the time-sliced sum of the result is written to "<output>.<trajectory>".
 void TChargedProp::execute(void)
 {
    // CACHING ANALYTIC EXPRESSIONS
    ScalarField &source = *env().getObject<ScalarField>(par().source);
    Complex     ci(0.0,1.0);
    FFT         fft(env().getGrid());
    // cache free scalar propagator
    if (!env().hasCreatedObject(freeMomPropName_))
    {
        LOG(Message) << "Caching momentum space free scalar propagator"
                     << " (mass= " << par().mass << ")..." << std::endl;
        freeMomProp_ = env().createLattice<ScalarField>(freeMomPropName_);
        SIMPL::MomentumSpacePropagator(*freeMomProp_, par().mass);
    }
    else
    {
        freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_);
    }
    // cache G*F*src
    if (!env().hasCreatedObject(GFSrcName_))
    {
        GFSrc_ = env().createLattice<ScalarField>(GFSrcName_);
        fft.FFT_all_dim(*GFSrc_, source, FFT::forward);
        *GFSrc_ = (*freeMomProp_)*(*GFSrc_);
    }
    else
    {
        GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
    }
    // cache phases
    // phase_ is a member and is only ever appended to below. Reset it so
    // that phase_[mu] always addresses the pointers obtained in THIS call;
    // otherwise a repeated call would keep using the entries of the first
    // call and the vector would grow by Nd on every execution.
    phase_.clear();
    if (!env().hasCreatedObject(phaseName_[0]))
    {
        std::vector<int> &l = env().getGrid()->_fdimensions;
        LOG(Message) << "Caching shift phases..." << std::endl;
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            Real    twoPiL = M_PI*2./l[mu];
            phase_.push_back(env().createLattice<ScalarField>(phaseName_[mu]));
            // phase_mu(x) = exp(i*2*pi*x_mu/L_mu)
            LatticeCoordinate(*(phase_[mu]), mu);
            *(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu])));
        }
    }
    else
    {
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            phase_.push_back(env().getObject<ScalarField>(phaseName_[mu]));
        }
    }
    // PROPAGATOR CALCULATION
    LOG(Message) << "Computing charged scalar propagator"
                 << " (mass= " << par().mass
                 << ", charge= " << par().charge << ")..." << std::endl;
    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
    ScalarField buf(env().getGrid());
    ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_;
    double      q = par().charge;
    // G*F*Src
    prop = GFSrc;
    // - q*G*momD1*G*F*Src (momD1 = F*D1*Finv)
    buf = GFSrc;
    momD1(buf, fft);
    buf = G*buf;
    prop = prop - q*buf;
    // + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
    momD1(buf, fft);
    prop = prop + q*q*G*buf;
    // - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv)
    buf = GFSrc;
    momD2(buf, fft);
    prop = prop - q*q*G*buf;
    // final FT
    fft.FFT_all_dim(prop, prop, FFT::backward);
    // OUTPUT IF NECESSARY
    if (!par().output.empty())
    {
        std::string           filename = par().output + "." +
                                         std::to_string(env().getTrajectory());
        LOG(Message) << "Saving zero-momentum projection to '"
                     << filename << "'..." << std::endl;
        CorrWriter            writer(filename);
        std::vector<TComplex> vecBuf;
        std::vector<Complex>  result;
        sliceSum(prop, vecBuf, Tp);
        result.resize(vecBuf.size());
        for (unsigned int t = 0; t < vecBuf.size(); ++t)
        {
            result[t] = TensorRemove(vecBuf[t]);
        }
        write(writer, "charge", q);
        write(writer, "prop", result);
    }
 }
 // Applies the momentum-space operator momD1 to s in place.
 // For each direction mu the result accumulates
 //   - i * F[ A_mu * Finv[ phase_mu * s ] ]  +  i * adj(phase_mu) * F[ A_mu * Finv[s] ]
 // with F / Finv the forward / backward FFT. s is overwritten by the result.
 void TChargedProp::momD1(ScalarField &s, FFT &fft)
 {
    EmField     &emField = *env().getObject<EmField>(par().emField);
    ScalarField work(env().getGrid()), acc(env().getGrid()),
                comp(env().getGrid());
    Complex     imagUnit(0.0,1.0);
    const auto  nd = env().getNd();

    acc = zero;
    // first term: uses s while it is still in momentum space
    for (unsigned int dir = 0; dir < nd; ++dir)
    {
        comp = peekLorentz(emField, dir);
        work = (*phase_[dir])*s;
        fft.FFT_all_dim(work, work, FFT::backward);
        work = comp*work;
        fft.FFT_all_dim(work, work, FFT::forward);
        acc = acc - imagUnit*work;
    }
    // second term: s is transformed back once, in place, and reused
    fft.FFT_all_dim(s, s, FFT::backward);
    for (unsigned int dir = 0; dir < nd; ++dir)
    {
        comp = peekLorentz(emField, dir);
        work = comp*s;
        fft.FFT_all_dim(work, work, FFT::forward);
        acc = acc + imagUnit*adj(*phase_[dir])*work;
    }
    s = acc;
 }
 // Applies the momentum-space operator momD2 to s in place.
 // For each direction mu the result accumulates
 //   .5 * F[ A_mu^2 * Finv[ phase_mu * s ] ]  +  .5 * adj(phase_mu) * F[ A_mu^2 * Finv[s] ]
 // with F / Finv the forward / backward FFT. s is overwritten by the result.
 void TChargedProp::momD2(ScalarField &s, FFT &fft)
 {
    EmField     &emField = *env().getObject<EmField>(par().emField);
    ScalarField work(env().getGrid()), acc(env().getGrid()),
                comp(env().getGrid());
    const auto  nd = env().getNd();

    acc = zero;
    // first term: uses s while it is still in momentum space
    for (unsigned int dir = 0; dir < nd; ++dir)
    {
        comp = peekLorentz(emField, dir);
        work = (*phase_[dir])*s;
        fft.FFT_all_dim(work, work, FFT::backward);
        work = comp*comp*work;
        fft.FFT_all_dim(work, work, FFT::forward);
        acc = acc + .5*work;
    }
    // second term: s is transformed back once, in place, and reused
    fft.FFT_all_dim(s, s, FFT::backward);
    for (unsigned int dir = 0; dir < nd; ++dir)
    {
        comp = peekLorentz(emField, dir);
        work = comp*comp*s;
        fft.FFT_all_dim(work, work, FFT::forward);
        acc = acc + .5*adj(*phase_[dir])*work;
    }
    s = acc;
 }
@@ -0,0 +1,61 @@
 #ifndef Hadrons_MScalar_ChargedProp_hpp_
 #define Hadrons_MScalar_ChargedProp_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
 #include <Grid/Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                       Charged scalar propagator                            *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalar)
 // Serializable parameter set of the ChargedProp module (it is the template
 // argument of Module<> in TChargedProp).
 //   emField : name of the EmField object fetched from the environment
 //   source  : NOTE(review): presumably the name of the source field -- confirm
 //   mass    : scalar mass
 //   charge  : NOTE(review): presumably the scalar electric charge -- confirm
 //   output  : NOTE(review): presumably an output file stem -- confirm
 class ChargedPropPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar,
                                    std::string, emField,
                                    std::string, source,
                                    double,      mass,
                                    double,      charge,
                                    std::string, output);
 };
 // Hadrons module computing a charged scalar propagator; configured through
 // ChargedPropPar.
 class TChargedProp: public Module<ChargedPropPar>
 {
 public:
    SCALAR_TYPE_ALIASES(SIMPL,);
    // electromagnetic field and single-component field types
    using EmField = PhotonR::GaugeField;
    using EmComp  = PhotonR::GaugeLinkField;
 public:
    // construction / destruction
    TChargedProp(const std::string name);
    virtual ~TChargedProp() = default;
    // names of the objects this module depends on / produces
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();
    // module life cycle
    virtual void setup();
    virtual void execute();
 private:
    // in-place application of the terms built from one (momD1) and two
    // (momD2) powers of the EM field components
    void momD1(ScalarField &s, FFT &fft);
    void momD2(ScalarField &s, FFT &fft);
    // object names
    std::string                freeMomPropName_;
    std::string                GFSrcName_;
    std::vector<std::string>   phaseName_;
    // non-owning handles on the corresponding objects
    ScalarField                *freeMomProp_;
    ScalarField                *GFSrc_;
    std::vector<ScalarField *> phase_;
    EmField                    *A;
 };
 MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MScalar_ChargedProp_hpp_
@@ -0,0 +1,79 @@
 #include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
 #include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MScalar;
 /******************************************************************************
 *                        TFreeProp implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<FreePropPar> base; nothing else to do.
 TFreeProp::TFreeProp(const std::string name) : Module<FreePropPar>(name) {}
 // dependencies/products ///////////////////////////////////////////////////////
 std::vector<std::string> TFreeProp::getInput(void)
 {
    std::vector<std::string> in = {par().source};
    return in;
 }
 std::vector<std::string> TFreeProp::getOutput(void)
 {
    std::vector<std::string> out = {getName()};
    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Registers the lattices used by execute(): the momentum-space propagator
 // (whose name is derived from the mass, and which is registered only if no
 // object with that name is registered yet) and the output propagator.
 void TFreeProp::setup(void)
 {
    freeMomPropName_ = FREEMOMPROP(par().mass);

    const bool alreadyRegistered = env().hasRegisteredObject(freeMomPropName_);

    if (!alreadyRegistered)
    {
        env().registerLattice<ScalarField>(freeMomPropName_);
    }
    env().registerLattice<ScalarField>(getName());
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TFreeProp::execute(void)
 {
    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
    ScalarField &source = *env().getObject<ScalarField>(par().source);
    ScalarField *freeMomProp;
    if (!env().hasCreatedObject(freeMomPropName_))
    {
        LOG(Message) << "Caching momentum space free scalar propagator"
                     << " (mass= " << par().mass << ")..." << std::endl;
        freeMomProp = env().createLattice<ScalarField>(freeMomPropName_);
        SIMPL::MomentumSpacePropagator(*freeMomProp, par().mass);
    }
    else
    {
        freeMomProp = env().getObject<ScalarField>(freeMomPropName_);
    }
    LOG(Message) << "Computing free scalar propagator..." << std::endl;
    SIMPL::FreePropagator(source, prop, *freeMomProp);
    if (!par().output.empty())
    {
        TextWriter            writer(par().output + "." +
                                     std::to_string(env().getTrajectory()));
        std::vector<TComplex> buf;
        std::vector<Complex>  result;
        sliceSum(prop, buf, Tp);
        result.resize(buf.size());
        for (unsigned int t = 0; t < buf.size(); ++t)
        {
            result[t] = TensorRemove(buf[t]);
        }
        write(writer, "prop", result);
    }
 }
@@ -0,0 +1,50 @@
 #ifndef Hadrons_MScalar_FreeProp_hpp_
 #define Hadrons_MScalar_FreeProp_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
 #include <Grid/Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                               FreeProp                                     *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalar)
 // Serializable parameter set of the FreeProp module.
 //   source : name of the source field fetched from the environment
 //   mass   : scalar mass, also used to build the name of the cached
 //            momentum-space propagator
 //   output : output file stem; no file is written when empty
 class FreePropPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
                                    std::string, source,
                                    double,      mass,
                                    std::string, output);
 };
 // Hadrons module computing the free scalar propagator of a source field;
 // configured through FreePropPar.
 class TFreeProp: public Module<FreePropPar>
 {
 public:
    SCALAR_TYPE_ALIASES(SIMPL,);
 public:
    // construction / destruction
    TFreeProp(const std::string name);
    virtual ~TFreeProp() = default;
    // names of the objects this module depends on / produces
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();
    // module life cycle
    virtual void setup();
    virtual void execute();
 private:
    // name of the cached momentum-space propagator (set in setup())
    std::string freeMomPropName_;
 };
 MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar);
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MScalar_FreeProp_hpp_
@@ -0,0 +1,6 @@
 #ifndef Hadrons_Scalar_hpp_
 #define Hadrons_Scalar_hpp_
 // Name of the cached momentum-space free scalar propagator for mass m.
 // The expansion is a fully parenthesized std::string so that it composes
 // safely with surrounding operators (e.g. FREEMOMPROP(m).size() or
 // "prefix" + FREEMOMPROP(m)); the unparenthesized form bound those operators
 // to only part of the expression.
 #define FREEMOMPROP(m) (std::string("_scalar_mom_prop_") + std::to_string(m))
 #endif // Hadrons_Scalar_hpp_
@@ -0,0 +1,114 @@
 #ifndef Hadrons_MSink_Point_hpp_
 #define Hadrons_MSink_Point_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
 #include <Grid/Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                                   Point                                    *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MSink)
 // Serializable parameter set of the point sink module.
 //   mom : momentum as a string; it is parsed with strToVec<Real> and its
 //         components are indexed by lattice direction
 class PointPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(PointPar,
                                    std::string, mom);
 };
 // Hadrons module that stores a point sink function (a SinkFn) under its own
 // name; configured through PointPar.
 template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    SINK_TYPE_ALIASES();
 public:
    // construction / destruction
    TPoint(const std::string name);
    virtual ~TPoint() = default;
    // names of the objects this module depends on / produces
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();
    // module life cycle
    virtual void setup();
    virtual void execute();
 };
 MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSink);
 MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSink);
 /******************************************************************************
 *                          TPoint implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<PointPar> base; nothing else to do.
 template <typename FImpl>
 TPoint<FImpl>::TPoint(const std::string name) : Module<PointPar>(name) {}
 // dependencies/products ///////////////////////////////////////////////////////
 // This module has no inputs: the returned list is empty.
 template <typename FImpl>
 std::vector<std::string> TPoint<FImpl>::getInput(void)
 {
    return std::vector<std::string>();
 }
 // The only output is an object carrying the module's own name.
 template <typename FImpl>
 std::vector<std::string> TPoint<FImpl>::getOutput(void)
 {
    return std::vector<std::string>{getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Registers the output object, sized as one 4d LatticeComplex.
 template <typename FImpl>
 void TPoint<FImpl>::setup(void)
 {
    unsigned int size = env().template lattice4dSize<LatticeComplex>();

    env().registerObject(getName(), size);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Builds the phase field exp(2*pi*i * sum_mu p[mu]*x[mu]/L[mu]) for the
 // momentum given in par().mom and stores, under the module name, a SinkFn
 // that multiplies a propagator field by that phase and sums it over each
 // Tp slice.
 // Momentum components that are not provided are treated as zero: the loop
 // stops at whichever of getNd() and p.size() is smaller, so a momentum
 // string with fewer components than lattice directions no longer reads past
 // the end of p.
 template <typename FImpl>
 void TPoint<FImpl>::execute(void)
 {
    std::vector<Real> p = strToVec<Real>(par().mom);
    LatticeComplex    ph(env().getGrid()), coor(env().getGrid());
    Complex           i(0.0,1.0);
    LOG(Message) << "Setting up point sink function for momentum ["
                 << par().mom << "]" << std::endl;
    ph = zero;
    for(unsigned int mu = 0; (mu < env().getNd()) && (mu < p.size()); mu++)
    {
        LatticeCoordinate(coor, mu);
        ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
    }
    ph = exp((Real)(2*M_PI)*i*ph);
    // ph is captured by copy: the closure is stored in the environment and
    // outlives this function's locals
    auto sink = [ph](const PropagatorField &field)
    {
        SlicedPropagator res;
        PropagatorField  tmp = ph*field;
        sliceSum(tmp, res, Tp);
        return res;
    };
    env().setObject(getName(), new SinkFn(sink));
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MSink_Point_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_RBPrecCG_hpp_
+#ifndef Hadrons_MSolver_RBPrecCG_hpp_
-#define Hadrons_RBPrecCG_hpp_
+#define Hadrons_MSolver_RBPrecCG_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -53,7 +53,7 @@ template <typename FImpl>
 class TRBPrecCG: public Module<RBPrecCGPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TRBPrecCG(const std::string name);
@@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_RBPrecCG_hpp_
+#endif // Hadrons_MSolver_RBPrecCG_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_Point_hpp_
+#ifndef Hadrons_MSource_Point_hpp_
-#define Hadrons_Point_hpp_
+#define Hadrons_MSource_Point_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -63,7 +63,7 @@ template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TPoint(const std::string name);
@@ -78,7 +78,8 @@ public:
    virtual void execute(void);
 };
-MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSource);
+MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSource);
 MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSource);
 /******************************************************************************
 *                       TPoint template implementation                       *
@@ -132,4 +133,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_Point_hpp_
+#endif // Hadrons_MSource_Point_hpp_
@@ -28,8 +28,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_SeqGamma_hpp_
+#ifndef Hadrons_MSource_SeqGamma_hpp_
-#define Hadrons_SeqGamma_hpp_
+#define Hadrons_MSource_SeqGamma_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -72,7 +72,7 @@ template <typename FImpl>
 class TSeqGamma: public Module<SeqGammaPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FGS_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TSeqGamma(const std::string name);
@@ -161,4 +161,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_SeqGamma_hpp_
+#endif // Hadrons_MSource_SeqGamma_hpp_
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_WallSource_hpp_
+#ifndef Hadrons_MSource_WallSource_hpp_
-#define Hadrons_WallSource_hpp_
+#define Hadrons_MSource_WallSource_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -64,7 +64,7 @@ template <typename FImpl>
 class TWall: public Module<WallPar>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TWall(const std::string name);
@@ -144,4 +144,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_WallSource_hpp_
+#endif // Hadrons_MSource_WallSource_hpp_
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef Hadrons_Z2_hpp_
+#ifndef Hadrons_MSource_Z2_hpp_
-#define Hadrons_Z2_hpp_
+#define Hadrons_MSource_Z2_hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -67,7 +67,7 @@ template <typename FImpl>
 class TZ2: public Module<Z2Par>
 {
 public:
-    TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TZ2(const std::string name);
@@ -82,7 +82,8 @@ public:
    virtual void execute(void);
 };
-MODULE_REGISTER_NS(Z2, TZ2<FIMPL>, MSource);
+MODULE_REGISTER_NS(Z2,       TZ2<FIMPL>,        MSource);
 MODULE_REGISTER_NS(ScalarZ2, TZ2<ScalarImplCR>, MSource);
 /******************************************************************************
 *                       TZ2 template implementation                          *
@@ -148,4 +149,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons_Z2_hpp_
+#endif // Hadrons_MSource_Z2_hpp_
@@ -1,5 +1,5 @@
-#ifndef Hadrons____FILEBASENAME____hpp_
+#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
-#define Hadrons____FILEBASENAME____hpp_
+#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -41,4 +41,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons____FILEBASENAME____hpp_
+#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
@@ -1,5 +1,5 @@
-#ifndef Hadrons____FILEBASENAME____hpp_
+#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
-#define Hadrons____FILEBASENAME____hpp_
+#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 #include <Grid/Hadrons/Global.hpp>
 #include <Grid/Hadrons/Module.hpp>
@@ -82,4 +82,4 @@ END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
-#endif // Hadrons____FILEBASENAME____hpp_
+#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
@@ -4,7 +4,10 @@ modules_cc =\
  Modules/MContraction/WeakNeutral4ptDisc.cc \
  Modules/MGauge/Load.cc \
  Modules/MGauge/Random.cc \
-  Modules/MGauge/Unit.cc
+  Modules/MGauge/StochEm.cc \
  Modules/MGauge/Unit.cc \
  Modules/MScalar/ChargedProp.cc \
  Modules/MScalar/FreeProp.cc
 modules_hpp =\
  Modules/MAction/DWF.hpp \
@@ -17,14 +20,19 @@ modules_hpp =\
  Modules/MContraction/WeakHamiltonianEye.hpp \
  Modules/MContraction/WeakHamiltonianNonEye.hpp \
  Modules/MContraction/WeakNeutral4ptDisc.hpp \
  Modules/MFermion/GaugeProp.hpp \
  Modules/MGauge/Load.hpp \
  Modules/MGauge/Random.hpp \
  Modules/MGauge/StochEm.hpp \
  Modules/MGauge/Unit.hpp \
  Modules/MLoop/NoiseLoop.hpp \
  Modules/MScalar/ChargedProp.hpp \
  Modules/MScalar/FreeProp.hpp \
  Modules/MScalar/Scalar.hpp \
  Modules/MSink/Point.hpp \
  Modules/MSolver/RBPrecCG.hpp \
  Modules/MSource/Point.hpp \
  Modules/MSource/SeqGamma.hpp \
  Modules/MSource/Wall.hpp \
-  Modules/MSource/Z2.hpp \
+  Modules/MSource/Z2.hpp
  Modules/Quark.hpp
@@ -0,0 +1,11 @@
 #include <qed-fvol/Global.hpp>
 using namespace Grid;
 using namespace QCD;
 using namespace QedFVol;
 // Definitions of the QedFVol log channels. Each one is constructed with
 // on = 1 and its channel name (see the QedFVolLogger constructor).
 QedFVolLogger QedFVol::QedFVolLogError(1,"Error");
 QedFVolLogger QedFVol::QedFVolLogWarning(1,"Warning");
 QedFVolLogger QedFVol::QedFVolLogMessage(1,"Message");
 QedFVolLogger QedFVol::QedFVolLogIterative(1,"Iterative");
 QedFVolLogger QedFVol::QedFVolLogDebug(1,"Debug");
@@ -0,0 +1,42 @@
 #ifndef QedFVol_Global_hpp_
 #define QedFVol_Global_hpp_
 #include <Grid/Grid.h>
 // BEGIN_QEDFVOL_NAMESPACE opens namespace Grid, makes the names of QCD
 // visible there, opens the nested namespace QedFVol and imports
 // Grid::operator<< into it. END_QEDFVOL_NAMESPACE closes both namespaces.
 #define BEGIN_QEDFVOL_NAMESPACE \
 namespace Grid {\
 using namespace QCD;\
 namespace QedFVol {\
 using Grid::operator<<;
 #define END_QEDFVOL_NAMESPACE }}
 /* the 'using Grid::operator<<;' statement prevents a very nasty compilation
 * error with GCC (clang compiles fine without it).
 */
 BEGIN_QEDFVOL_NAMESPACE
 // Logger whose first constructor argument is fixed to "QedFVol" and whose
 // last two are fixed to GridLogColours and "BLACK"; `on` and `nm` are
 // passed through unchanged.
 class QedFVolLogger: public Logger
 {
 public:
    QedFVolLogger(int on, std::string nm)
    : Logger("QedFVol", on, nm, GridLogColours, "BLACK")
    {}
 };
 // LOG(X) starts a std::cout statement on the logger QedFVolLogX, e.g.
 // LOG(Message) << "text" << std::endl;
 #define LOG(channel) std::cout << QedFVolLog##channel
 // Logs msg on the Error channel together with the calling function, file
 // and line, then aborts the program.
 // The two statements are wrapped in a single compound statement so that
 // `if (cond) QEDFVOL_ERROR(..)` aborts only when cond holds; as two bare
 // statements the abort() escaped the `if` and ran unconditionally. The macro
 // can still be written with or without a trailing semicolon.
 #define QEDFVOL_ERROR(msg)\
 {\
     LOG(Error) << msg << " (" << __FUNCTION__ << " at " << __FILE__ << ":"\
                << __LINE__ << ")" << std::endl;\
     abort();\
 }
 // Prints "<expression text>= <value>" on the Debug channel. The expansion
 // already ends with a semicolon.
 #define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
 // Log channels addressed by LOG(Error), LOG(Warning), LOG(Message),
 // LOG(Iterative) and LOG(Debug); declared here, defined in another
 // translation unit.
 extern QedFVolLogger QedFVolLogError;
 extern QedFVolLogger QedFVolLogWarning;
 extern QedFVolLogger QedFVolLogMessage;
 extern QedFVolLogger QedFVolLogIterative;
 extern QedFVolLogger QedFVolLogDebug;
 END_QEDFVOL_NAMESPACE
 #endif // QedFVol_Global_hpp_
@@ -0,0 +1,9 @@
 AM_CXXFLAGS += -I$(top_srcdir)/extras
 bin_PROGRAMS = qed-fvol
 qed_fvol_SOURCES =   \
    qed-fvol.cc      \
    Global.cc
 qed_fvol_LDADD   = -lGrid
@@ -0,0 +1,265 @@
 #ifndef QEDFVOL_WILSONLOOPS_H
 #define QEDFVOL_WILSONLOOPS_H
 #include <Global.hpp>
 BEGIN_QEDFVOL_NAMESPACE
 template <class Gimpl> class NewWilsonLoops : public Gimpl {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;
  //////////////////////////////////////////////////
  // directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
                           const int mu, const int nu) {
    // Annoyingly, must use either scope resolution to find dependent base
    // class,
    // or this-> ; there is no "this" in a static method. This forces explicit
    // Gimpl scope
    // resolution throughout the usage in this file, and rather defeats the
    // purpose of deriving
    // from Gimpl.
    plaq = Gimpl::CovShiftBackward(
        U[mu], mu, Gimpl::CovShiftBackward(
                       U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
  }
  //////////////////////////////////////////////////
  // trace of directed plaquette oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void traceDirPlaquette(LatticeComplex &plaq,
                                const std::vector<GaugeMat> &U, const int mu,
                                const int nu) {
    GaugeMat sp(U[0]._grid);
    dirPlaquette(sp, U, mu, nu);
    plaq = trace(sp);
  }
  //////////////////////////////////////////////////
  // sum over all planes of plaquette
  //////////////////////////////////////////////////
  static void sitePlaquette(LatticeComplex &Plaq,
                            const std::vector<GaugeMat> &U) {
    LatticeComplex sitePlaq(U[0]._grid);
    Plaq = zero;
    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceDirPlaquette(sitePlaq, U, mu, nu);
        Plaq = Plaq + sitePlaq;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static Real sumPlaquette(const GaugeLorentz &Umu) {
    std::vector<GaugeMat> U(4, Umu._grid);
    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Plaq(Umu._grid);
    sitePlaquette(Plaq, U);
    TComplex Tp = sum(Plaq);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static Real avgPlaquette(const GaugeLorentz &Umu) {
    int ndim = Umu._grid->_ndimension;
    Real sumplaq = sumPlaquette(Umu);
    Real vol = Umu._grid->gSites();
    Real faces = (1.0 * ndim * (ndim - 1)) / 2.0;
    return sumplaq / vol / faces / Nc; // Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
                           const int Rmu, const int Rnu,
                           const int mu, const int nu) {
    wl = U[nu];
    for(int i = 0; i < Rnu-1; i++){
      wl = Gimpl::CovShiftForward(U[nu], nu, wl);
    }
    for(int i = 0; i < Rmu; i++){
      wl = Gimpl::CovShiftForward(U[mu], mu, wl);
    }
    for(int i = 0; i < Rnu; i++){
      wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
    }
    for(int i = 0; i < Rmu; i++){
      wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
    }
  }
  //////////////////////////////////////////////////
  // trace of Wilson Loop oriented in mu,nu plane
  //////////////////////////////////////////////////
  static void traceWilsonLoop(LatticeComplex &wl,
                                const std::vector<GaugeMat> &U,
                                const int Rmu, const int Rnu,
                                const int mu, const int nu) {
    GaugeMat sp(U[0]._grid);
    wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
    wl = trace(sp);
  }
  //////////////////////////////////////////////////
  // sum over all planes of Wilson loop
  //////////////////////////////////////////////////
  static void siteWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
    LatticeComplex siteWl(U[0]._grid);
    Wl = zero;
    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
        Wl = Wl + siteWl;
        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
        Wl = Wl + siteWl;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over planes of Wilson loop with length R1
  // in the time direction
  //////////////////////////////////////////////////
  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
    LatticeComplex siteWl(U[0]._grid);
    int ndim = U[0]._grid->_ndimension;
    Wl = zero;
    for (int nu = 0; nu < ndim - 1; nu++) {
      traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
      Wl = Wl + siteWl;
    }
  }
  //////////////////////////////////////////////////
  // sum Wilson loop over all planes orthogonal to the time direction
  //////////////////////////////////////////////////
  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
    LatticeComplex siteWl(U[0]._grid);
    Wl = zero;
    for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
        Wl = Wl + siteWl;
        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
        Wl = Wl + siteWl;
      }
    }
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
  static Real sumWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    std::vector<GaugeMat> U(4, Umu._grid);
    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Wl(Umu._grid);
    siteWilsonLoop(Wl, U, R1, R2);
    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of timelike Wilson loop
  //////////////////////////////////////////////////
  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    std::vector<GaugeMat> U(4, Umu._grid);
    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Wl(Umu._grid);
    siteTimelikeWilsonLoop(Wl, U, R1, R2);
    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    std::vector<GaugeMat> U(4, Umu._grid);
    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }
    LatticeComplex Wl(Umu._grid);
    siteSpatialWilsonLoop(Wl, U, R1, R2);
    TComplex Tp = sum(Wl);
    Complex p = TensorRemove(Tp);
    return p.real();
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
  static Real avgWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    int ndim = Umu._grid->_ndimension;
    Real sumWl = sumWilsonLoop(Umu, R1, R2);
    Real vol = Umu._grid->gSites();
    Real faces = 1.0 * ndim * (ndim - 1);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of timelike Wilson loop
  //////////////////////////////////////////////////
  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    int ndim = Umu._grid->_ndimension;
    Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
    Real vol = Umu._grid->gSites();
    Real faces = 1.0 * (ndim - 1);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
    int ndim = Umu._grid->_ndimension;
    Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
    Real vol = Umu._grid->gSites();
    Real faces = 1.0 * (ndim - 1) * (ndim - 2);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
 };
 END_QEDFVOL_NAMESPACE
 #endif // QEDFVOL_WILSONLOOPS_H
@@ -0,0 +1,88 @@
 #include <Global.hpp>
 #include <WilsonLoops.h>
 using namespace Grid;
 using namespace QCD;
 using namespace QedFVol;
 // Gauge implementation handed to NewWilsonLoops, and the photon field types.
 typedef PeriodicGaugeImpl<QedGimplR>    QedPeriodicGimplR;
 typedef PhotonR::GaugeField             EmField;
 typedef PhotonR::GaugeLinkField         EmComp;
 // NCONFIGS: number of stochastic photon fields generated and averaged over.
 // NWILSON : largest loop extent; square iw x iw loops for iw = 1..NWILSON.
 const int NCONFIGS = 10;
 const int NWILSON = 10;
 // Generates NCONFIGS stochastic photon fields, exponentiates each one and
 // accumulates -2*log(W) for square iw x iw Wilson loops (iw = 1..NWILSON),
 // averaged over all planes, over timelike planes and over spatial planes.
 // The per-configuration averages are printed at the end.
 //
 // Usage: <program> <parameter file> [Grid options]
 int main(int argc, char *argv[])
 {
    // parse command line
    // NOTE(review): parameterFileName is stored but not read anywhere in
    // this function.
    std::string parameterFileName;
    if (argc < 2)
    {
        std::cerr << "usage: " << argv[0] << " <parameter file> [Grid options]";
        std::cerr << std::endl;
        std::exit(EXIT_FAILURE);
    }
    parameterFileName = argv[1];
    // initialization
    Grid_init(&argc, &argv);
    // mirror the activation state of the Grid log channels
    QedFVolLogError.Active(GridLogError.isActive());
    QedFVolLogWarning.Active(GridLogWarning.isActive());
    QedFVolLogMessage.Active(GridLogMessage.isActive());
    QedFVolLogIterative.Active(GridLogIterative.isActive());
    QedFVolLogDebug.Active(GridLogDebug.isActive());
    LOG(Message) << "Grid initialized" << std::endl;
    // QED stuff
    std::vector<int> latt_size   = GridDefaultLatt();
    std::vector<int> simd_layout = GridDefaultSimd(4, vComplex::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();
    GridCartesian    grid(latt_size,simd_layout,mpi_layout);
    GridParallelRNG  pRNG(&grid);
    PhotonR          photon(PhotonR::Gauge::feynman,
                            PhotonR::ZmScheme::qedL);
    EmField          a(&grid);
    EmField          expA(&grid);
    Complex imag_unit(0, 1);
    Real wlA;
    std::vector<Real> logWlAvg(NWILSON, 0.0), logWlTime(NWILSON, 0.0), logWlSpace(NWILSON, 0.0);
    pRNG.SeedRandomDevice();
    LOG(Message) << "Wilson loop calculation beginning" << std::endl;
    for(int ic = 0; ic < NCONFIGS; ic++){
        LOG(Message) << "Configuration " << ic <<std::endl;
        photon.StochasticField(a, pRNG);
        // Exponentiate photon field
        expA = exp(imag_unit*a);
        // Calculate Wilson loops
        // The avg* functions divide by Nc; the factor 3 presumably undoes
        // that normalisation for this field -- TODO confirm that Nc == 3.
        for(int iw=1; iw<=NWILSON; iw++){
            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, iw, iw) * 3;
            logWlAvg[iw-1] -= 2*log(wlA);
            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, iw, iw) * 3;
            logWlTime[iw-1] -= 2*log(wlA);
            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, iw, iw) * 3;
            logWlSpace[iw-1] -= 2*log(wlA);
        }
    }
    LOG(Message) << "Wilson loop calculation completed" << std::endl;
    // Report the accumulated values. The bound is NWILSON (it used to be a
    // literal 10) so that it always matches the size of the accumulators.
    for(int iw=1; iw<=NWILSON; iw++){
        LOG(Message) << iw << 'x' << iw << " Wilson loop" << std::endl;
        LOG(Message) << "-2log(W) average: " << logWlAvg[iw-1]/NCONFIGS << std::endl;
        LOG(Message) << "-2log(W) timelike: " << logWlTime[iw-1]/NCONFIGS << std::endl;
        LOG(Message) << "-2log(W) spatial: " << logWlSpace[iw-1]/NCONFIGS << std::endl;
    }
    // epilogue
    LOG(Message) << "Grid is finalizing now" << std::endl;
    Grid_finalize();
    return EXIT_SUCCESS;
 }
@@ -41,6 +41,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/Action.h>
 #include <Grid/qcd/utils/GaugeFix.h>
 #include <Grid/qcd/smearing/Smearing.h>
 #include <Grid/parallelIO/MetaData.h>
 #include <Grid/qcd/hmc/HMC_aggregate.h>
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_base.cc
 endif
-if BUILD_COMMS_MPI3L
+if BUILD_COMMS_MPIT
-  extra_sources+=communicator/Communicator_mpi3_leader.cc
+  extra_sources+=communicator/Communicator_mpit.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
@@ -1,137 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/DenseMatrix.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_DENSE_MATRIX_H
 #define GRID_DENSE_MATRIX_H
 namespace Grid {
    /////////////////////////////////////////////////////////////
    // Matrix utils
    // Dense vectors and matrices are plain (nested) std::vector's;
    // a matrix element is addressed as mat[row][col].
    /////////////////////////////////////////////////////////////
 template<class T> using DenseVector = std::vector<T>;
 template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
 /** Store the number of elements of vec in N **/
 template<class T> void Size(const DenseVector<T> & vec, int &N)
 {
  N = static_cast<int>(vec.size());
 }
 /** Store the number of rows in N and the length of row 0 in M.
     An empty matrix reports N = M = 0 (row 0 does not exist, so it must not be touched). **/
 template<class T> void Size(const DenseMatrix<T> & mat, int &N,int &M)
 {
  N = static_cast<int>(mat.size());
  M = mat.empty() ? 0 : static_cast<int>(mat[0].size());
 }
 /** Store the dimension of a square matrix in N; asserts rows == columns **/
 template<class T> void SizeSquare(const DenseMatrix<T> & mat, int &N)
 {
  int M; Size(mat,N,M);
  assert(N==M);
 }
 /** Resize a vector to N elements **/
 template<class T> void Resize(DenseVector<T > & mat, int N) {
  mat.resize(N);
 }
 /** Resize a matrix to N rows of M columns each **/
 template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) {
  mat.resize(N);
  for(int i=0;i<N;i++){
    mat[i].resize(M);
  }
 }
 /** Set every element of the matrix to val.
     val is taken by const reference so temporaries and constants are accepted too. **/
 template<class T> void Fill(DenseMatrix<T> & mat, const T &val) {
  int N,M;
  Size(mat,N,M);
  for(int i=0;i<N;i++){
  for(int j=0;j<M;j++){
    mat[i][j] = val;
  }}
 }
 /** Transpose of a matrix: returns the M x N matrix C with C[i][j] = mat[j][i] **/
 template<class T> DenseMatrix<T> Transpose(const DenseMatrix<T> & mat){
  int N,M;
  Size(mat,N,M);
  DenseMatrix<T> C; Resize(C,M,N);
  for(int i=0;i<M;i++){
  for(int j=0;j<N;j++){
    C[i][j] = mat[j][i];
  }}
  return C;
 }
 /** Set DenseMatrix to unit matrix (asserts that A is square) **/
 template<class T> void Unity(DenseMatrix<T> &A){
  int N;  SizeSquare(A,N);
  for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      if ( i==j ) A[i][j] = 1;
      else        A[i][j] = 0;
    }
  }
 }
 /** Add c * I to the (square) matrix A **/
 template<class T>
 void PlusUnit(DenseMatrix<T> & A,T c){
  int dim;  SizeSquare(A,dim);
  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;}
 }
 /** Return the Hermitian conjugate of a square matrix: C[i][j] = conj(mat[j][i]) **/
 template<class T>
 DenseMatrix<T> HermitianConj(const DenseMatrix<T> &mat){
  int dim; SizeSquare(mat,dim);
  DenseMatrix<T> C; Resize(C,dim,dim);
  for(int i=0;i<dim;i++){
    for(int j=0;j<dim;j++){
      C[i][j] = conj(mat[j][i]);
    }
  }
  return C;
 }
 /** Get the submatrix made of rows [row_st,row_end) and columns [col_st,col_end) of A.
     Both end indices are exclusive; the result need not be square. **/
 template <class T>
 DenseMatrix<T> GetSubMtx(const DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
 {
  DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
  for(int i = row_st; i<row_end; i++){
  for(int j = col_st; j<col_end; j++){
    H[i-row_st][j-col_st]=A[i][j];
  }}
  return H;
 }
 }
 #include "Householder.h"
 #include "Francis.h"
 #endif
@@ -1,525 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/Francis.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef FRANCIS_H
 #define FRANCIS_H
 #include <cstdlib>
 #include <string>
 #include <cmath>
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
 #include <fstream>
 #include <complex>
 #include <algorithm>
 //#include <timer.h>
 //#include <lapacke.h>
 //#include <Eigen/Dense>
 namespace Grid {
 template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
 template <class T> int     Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
 /**
  Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
 H =
      x  x  x  x  x  x  x  x  x
      x  x  x  x  x  x  x  x  x
      0  x  x  x  x  x  x  x  x
      0  0  x  x  x  x  x  x  x
      0  0  0  x  x  x  x  x  x
      0  0  0  0  x  x  x  x  x
      0  0  0  0  0  x  x  x  x
      0  0  0  0  0  0  x  x  x
      0  0  0  0  0  0  0  x  x
 Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthogonal/unitary.
  Hin   : input matrix; it is copied, the caller's matrix is not modified.
  evals : receives the eigenvalues (stored in reverse order of discovery, see the end of the function).
  evecs : receives the eigenvectors, each multiplied by the accumulated transform P and normalized.
  small : tolerance for the Hessenberg check; also forwarded to Chop_subdiag.
  Returns the total number of QR sweeps performed (tot_it).
  Calls exit(1) if Hin is not Hessenberg within `small`, or after more than 100000 sweeps without deflation.
  Norm, Chop_subdiag, normalize, UTeigenvectors and the matrix*vector product are not defined in
  this file; their exact semantics are assumed from their names -- TODO confirm.
 **/
 template <class T>
 int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
 {
  DenseMatrix<T> H = Hin; 
  int N ; SizeSquare(H,N);
  // M, n, m (declared below) and p are never read in this function.
  int M = N;
  // NOTE(review): the only Fill visible alongside DenseMatrix takes (DenseMatrix<T>&, T&);
  // these calls need a different overload declared elsewhere -- confirm it exists.
  Fill(evals,0);
  Fill(evecs,0);
  T s,t,x=0,y=0,z=0;
  T u,d;
  T apd,amd,bc;
  DenseVector<T> p(N,0);
  T nrm = Norm(H);    ///DenseMatrix Norm
  int n, m;
  // e      : number of eigenvalues stored so far
  // it     : sweeps since the last deflation (reset to 0 on each deflation)
  // tot_it : total sweeps, the return value
  // l      : start row of the active block, as reported by Chop_subdiag
  // r      : set non-zero when the current pass deflated something
  int e = 0;
  int it = 0;
  int tot_it = 0;
  int l = 0;
  int r = 0;
  // P accumulates every reflection applied to H from the right
  DenseMatrix<T> P; Resize(P,N,N); Unity(P);
  // trows[i] == 1 marks the second row of a deflated 2x2 block
  DenseVector<int> trows(N,0);
  /// Check if the matrix is really hessenberg, if not abort
  RealD sth = 0;
  for(int j=0;j<N;j++){
    for(int i=j+2;i<N;i++){
      sth = abs(H[i][j]);
      if(sth > small){
 	std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
 	exit(1);
      }
    }
  }
  // From here on N is the size of the not-yet-deflated leading block.
  do{
    std::cout << "Francis QR Step N = " << N << std::endl;
    /** Check for convergence
      x  x  x  x  x
      0  x  x  x  x
      0  0  x  x  x
      0  0  x  x  x
      0  0  0  0  x
      for this matrix l = 4
     **/
    do{
      l = Chop_subdiag(H,nrm,e,small);
      r = 0;    ///May have converged on more than one eval
      ///Single eval
      if(l == N-1){
        evals[e] = H[l][l];
        N--; e++; r++; it = 0;
      }
      ///RealD eval
      // Trailing 2x2 block: its two eigenvalues come from the quadratic formula
      if(l == N-2){
        trows[l+1] = 1;    ///Needed for UTSolve
        apd = H[l][l] + H[l+1][l+1];
        amd = H[l][l] - H[l+1][l+1];
        bc =  (T)4.0*H[l+1][l]*H[l][l+1];
        evals[e]   = (T)0.5*( apd + sqrt(amd*amd + bc) );
        evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
        N-=2; e+=2; r++; it = 0;
      }
    } while(r>0);
    if(N ==0) break;
    DenseVector<T > ck; Resize(ck,3);
    DenseVector<T> v;   Resize(v,3);
    // Scan upwards from row N-3 for the row at which to start the bulge chase;
    // this loop variable m hides the unused outer m.
    for(int m = N-3; m >= l; m--){
      ///Starting vector essentially random shift.
      // Used on every 10th sweep without deflation
      if(it%10 == 0 && N >= 3 && it > 0){
        s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
        t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
        z = H[m+1][m]*H[m+2][m+1];
      }
      ///Starting vector implicit Q theorem
      // s and t are the trace and determinant of the trailing 2x2 block
      else{
        s = (H[N-2][N-2] + H[N-1][N-1]);
        t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
        z = H[m+1][m]*H[m+2][m+1];
      }
      ck[0] = x; ck[1] = y; ck[2] = z;
      if(m == l) break;
      /** Some stupid thing from numerical recipes, seems to work**/
      // PAB.. for heaven's sake quote page, purpose, evidence it works.
      //       what sort of comment is that!?!?!?
      // Stop at row m when u is too small to change d in floating point
      u=abs(H[m][m-1])*(abs(y)+abs(z));
      d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
      if ((T)abs(u+d) == (T)abs(d) ){
 	l = m; break;
      }
      //if (u < small){l = m; break;}
    }
    if(it > 100000){
     std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
     std::cout << "got " << e << " evals " << l << " " << N << std::endl;
      exit(1);
    }
    normalize(ck);    ///Normalization cancels in PHP anyway
    T beta;
    // First reflection, built from the starting vector, applied to rows/columns l..l+2
    Householder_vector<T >(ck, 0, 2, v, beta);
    Householder_mult<T >(H,v,beta,0,l,l+2,0);
    Householder_mult<T >(H,v,beta,0,l,l+2,1);
    ///Accumulate eigenvector
    Householder_mult<T >(P,v,beta,0,l,l+2,1);
    int sw = 0;      ///Are we on the last row?
    // One reflection per column k, built from the entries below the diagonal of that column
    for(int k=l;k<N-2;k++){
      x = H[k+1][k];
      y = H[k+2][k];
      z = (T)0.0;
      if(k+3 <= N-1){
 	z = H[k+3][k];
      } else{
 	sw = 1; 
 	v[2] = (T)0.0;
      }
      ck[0] = x; ck[1] = y; ck[2] = z;
      normalize(ck);
      // On the last step (sw == 1) the reflection spans two rows instead of three
      Householder_vector<T >(ck, 0, 2-sw, v, beta);
      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
      ///Accumulate eigenvector
      Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
    }
    it++;
    tot_it++;
  }while(N > 1);
  N = evals.size();
  ///Annoying - UT solves in reverse order;
  DenseVector<T> tmp; Resize(tmp,N);
  for(int i=0;i<N;i++){
    tmp[i] = evals[N-i-1];
  } 
  evals = tmp;
  UTeigenvectors(H, trows, evals, evecs);
  // Map each eigenvector through the accumulated transform P, then normalize it
  for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
  return tot_it;
 }
 template <class T>
 int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
 {
  /**
  Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
  H =
  x  x  0  0  0  0
  x  x  x  0  0  0
  0  x  x  x  0  0
  0  0  x  x  x  0
  0  0  0  x  x  x
  0  0  0  0  x  x
  Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.  **/
  return my_Wilkinson(Hin, evals, evecs, small, small);
 }
 /**
  Eigen-solver for a tridiagonal matrix using QR sweeps built from Givens rotations.
  Hin   : input matrix; it is copied and the copy is scaled by 1/|Norm(Hin)| before iterating.
  evals : receives the eigenvalues, rescaled by the norm at the end.
  evecs : receives the eigenvectors, each multiplied by the accumulated rotations P and normalized.
  small : threshold forwarded to Chop_symm_subdiag.
  tol   : tolerance for the tridiagonality check; a violation only prints a warning.
  Returns the total number of sweeps (tot_it). Calls exit(1) after more than 1000000 sweeps
  without deflation.
  Norm, Chop_symm_subdiag, sign, normalize, UTeigenvectors and the matrix*scalar / matrix*vector
  products are not defined in this file; their semantics are assumed from their names -- TODO confirm.
 **/
 template <class T>
 int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
 {
  int N; SizeSquare(Hin,N);
  ///I don't want to modify the input but matrices must be passed by reference
  //Scale a matrix by its "norm"
  DenseMatrix<T> H;  H = Hin;
  RealD Hnorm = abs(Norm(Hin));
  H = H * (1.0 / Hnorm);
  // TODO use openmp and memset
  // NOTE(review): the only Fill visible alongside DenseMatrix takes (DenseMatrix<T>&, T&);
  // these calls need a different overload declared elsewhere -- confirm it exists.
  Fill(evals,0);
  Fill(evecs,0);
  // y is never assigned in this solver and stays 0; it only enters the test on u below.
  T t, x = 0, y = 0, z = 0;
  T u, d;
  T apd, amd, bc;
  T nrm = Norm(H);    ///DenseMatrix Norm
  int e = 0;       // eigenvalues stored so far
  int it = 0;      // sweeps since the last deflation
  int tot_it = 0;  // total sweeps, the return value
  int l = 0;       // start row of the active block
  int r = 0;       // non-zero when the current pass deflated something
  // P accumulates every rotation applied to H from the right
  DenseMatrix<T> P; Resize(P,N,N);
  Unity(P);
  DenseVector<int> trows(N, 0);
  /// Check if the matrix is really symm tridiag
  for(int j = 0; j < N; ++j)
  {
    for(int i = j + 2; i < N; ++i)
    {
      if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
      {
 	std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
 	std::cout << "Warning tridiagonalize and call again" << std::endl;
        // Deliberately not fatal: the solver carries on after the warning.
      }
    }
  }
  do{
    do{
      //Jasper
      //Check if the subdiagonal term is small enough (<small)
      //if true then it is converged.
      //check start from H.dim - e - 1
      //How to deal with more than 2 are converged?
      //What if Chop_symm_subdiag return something in the middle?
      //--------------
      l = Chop_symm_subdiag(H,nrm, e, small);
      r = 0;    ///May have converged on more than one eval
      //Jasper
      //In this case
      // x  x  0  0  0  0
      // x  x  x  0  0  0
      // 0  x  x  x  0  0
      // 0  0  x  x  x  0
      // 0  0  0  x  x  0
      // 0  0  0  0  0  x  <- l
      //--------------
      ///Single eval
      if(l == N - 1)
      {
        evals[e] = H[l][l];
        N--;
        e++;
        r++;
        it = 0;
      }
      //Jasper
      // x  x  0  0  0  0
      // x  x  x  0  0  0
      // 0  x  x  x  0  0
      // 0  0  x  x  0  0
      // 0  0  0  0  x  x  <- l
      // 0  0  0  0  x  x
      //--------------
      ///RealD eval
      // Trailing 2x2 block: its two eigenvalues come from the quadratic formula
      if(l == N - 2)
      {
        trows[l + 1] = 1;    ///Needed for UTSolve
        apd = H[l][l] + H[l + 1][ l + 1];
        amd = H[l][l] - H[l + 1][l + 1];
        bc =  (T) 4.0 * H[l + 1][l] * H[l][l + 1];
        evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
        evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
        N -= 2;
        e += 2;
        r++;
        it = 0;
      }
    }while(r > 0);
    //Jasper
    //Already converged
    //--------------
    if(N == 0) break;
    for(int m = N - 3; m >= l; m--)
    {
      ///Starting vector essentially random shift.
      // Used on every 10th sweep without deflation
      if(it%10 == 0 && N >= 3 && it > 0)
      {
        t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
        x = H[m][m] - t;
        z = H[m + 1][m];
      } else {
      ///Starting vector implicit Q theorem
        // Shift computed from the trailing 2x2 block
        d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
        t =  H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] 
 	  / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
        x = H[m][m] - t;
        z = H[m + 1][m];
      }
      // The starting vector (x,z) has been computed for row m; stop once the
      // start of the active block is reached.
      if(m == l)
        break;
      // Stop at row m when u is too small to change d in floating point
      u = abs(H[m][m - 1]) * (abs(y) + abs(z));
      d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
      if ((T)abs(u + d) == (T)abs(d))
      {
        l = m;
        break;
      }
    }
    //Jasper
    if(it > 1000000)
    {
      // The reported count matches the threshold tested above.
      std::cout << "Wilkinson: bugger it got stuck after 1000000 iterations" << std::endl;
      std::cout << "got " << e << " evals " << l << " " << N << std::endl;
      exit(1);
    }
    // First rotation, built from the shifted starting vector, acting on rows/columns l and l+1
    T s, c;
    Givens_calc<T>(x, z, c, s);
    Givens_mult<T>(H, l, l + 1, c, -s, 0);
    Givens_mult<T>(H, l, l + 1, c,  s, 1);
    Givens_mult<T>(P, l, l + 1, c,  s, 1);
    // One rotation per column k, built from the two entries below the diagonal of that column
    for(int k = l; k < N - 2; ++k)
    {
      // DenseMatrix is a nested std::vector: elements are addressed as H[row][col]
      x = H[k + 1][k];
      z = H[k + 2][k];
      Givens_calc<T>(x, z, c, s);
      Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
      Givens_mult<T>(H, k + 1, k + 2, c,  s, 1);
      Givens_mult<T>(P, k + 1, k + 2, c,  s, 1);
    }
    it++;
    tot_it++;
  }while(N > 1);
  N = evals.size();
  ///Annoying - UT solves in reverse order;
  DenseVector<T> tmp(N);
  for(int i = 0; i < N; ++i)
    tmp[i] = evals[N-i-1];
  evals = tmp;
  //
  UTeigenvectors(H, trows, evals, evecs);
  //UTSymmEigenvectors(H, trows, evals, evecs);
  // Rotate the eigenvectors through P and undo the initial 1/Hnorm scaling of the eigenvalues
  for(int i = 0; i < evals.size(); ++i)
  {
    evecs[i] = P * evecs[i];
    normalize(evecs[i]);
    evals[i] = evals[i] * Hnorm;
  }
  return tot_it;
 }
 template <class T>
 void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
  /**
  turn a matrix A =
  x  x  x  x  x
  x  x  x  x  x
  x  x  x  x  x
  x  x  x  x  x
  x  x  x  x  x
  into
  x  x  x  x  x
  x  x  x  x  x
  0  x  x  x  x
  0  0  x  x  x
  0  0  0  x  x
  with householder rotations
  Slow.
  A is overwritten in place; every reflection is also applied to Q from the right,
  so the caller normally passes Q as the unit matrix. Columns start..N-3 are processed.
  */
  int N ; SizeSquare(A,N);
  for(int k=start;k<N-2;k++){
    DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
    // DenseMatrix is a nested std::vector: elements are addressed as A[row][col]
    for(int i=k+1;i<N;i++){ck[i-k-1] = A[i][k];}  ///kth column
    normalize(ck);    ///Normalization cancels in PHP anyway
    // Start from 0 so Householder_mult sees a defined value (and does nothing)
    // when the column below the diagonal yields no reflection.
    T beta = 0;
    Householder_vector<T >(ck, 0, ck.size()-1, v, beta);  ///Householder vector
    Householder_mult<T>(A,v,beta,start,k+1,N-1,0);  ///A -> PA
    Householder_mult<T >(A,v,beta,start,k+1,N-1,1);  ///PA -> PAP^H
    ///Accumulate eigenvector
    Householder_mult<T >(Q,v,beta,start,k+1,N-1,1);  ///Q -> QP^H
  }
 }
 template <class T>
 void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
  /// Reduce A in place by delegating to Hess(A,Q,start); Q receives the same reflections.
  /// NOTE(review): Hess produces Hessenberg form; the result is tridiagonal only if the
  /// input is Hermitian/symmetric -- confirm callers guarantee that.
  int dim;
  SizeSquare(A,dim);   // kept for its squareness assertion; dim itself is not used
  Hess(A,Q,start);
 }
 template <class T>
 void ForceTridiagonal(DenseMatrix<T> &A){
  /// Zero every element of the square matrix A that lies two or more places off the
  /// diagonal, leaving only the diagonal and the first sub/super-diagonals.
  int dim;
  SizeSquare(A,dim);
  for(int row=0; row+2<dim; row++){
    for(int col=row+2; col<dim; col++){
      A[row][col]=0;   // above the super-diagonal
      A[col][row]=0;   // mirrored entry below the sub-diagonal
    }
  }
 }
 template <class T>
 int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
  /// Solve a symmetric eigensystem that need not be tridiagonal on entry.
  /// A copy of Ain is reduced with Tri (reflections accumulated in Q), handed to
  /// my_Wilkinson, and the resulting eigenvectors are multiplied by Q.
  /// Returns the iteration count reported by my_Wilkinson.
  int dim;
  SizeSquare(Ain,dim);
  DenseMatrix<T > work = Ain;          // Ain itself is left untouched
  DenseMatrix<T > Q;
  Resize(Q,dim,dim);
  Unity(Q);
  Tri(work,Q,0);
  const int sweeps = my_Wilkinson<T>(work, evals, evecs, small);
  for(int idx=0; idx<dim; idx++){
    evecs[idx] = Q*evecs[idx];
  }
  return sweeps;
 }
 /** Thin wrapper: forwards its arguments unchanged to the four-argument my_Wilkinson
     and returns that function's result. **/
 template <class T>
 int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
  return my_Wilkinson(Ain, evals, evecs, small);
 }
 /** Thin wrapper: forwards its arguments unchanged to my_SymmEigensystem
     and returns that function's result. **/
 template <class T>
 int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
  return my_SymmEigensystem(Ain, evals, evecs, small);
 }
 template <class T>
 int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
 ///Solve a general eigensystem, not necessarily in Hessenberg form.
 ///A copy of Ain is reduced with Hess (reflections accumulated in Q), handed to
 ///QReigensystem, and the resulting eigenvectors are multiplied by Q.
 ///Returns the iteration count reported by QReigensystem.
  // DenseMatrix is a nested std::vector, so size and initialisation go through the
  // free helpers (SizeSquare / Resize / Unity), exactly as in my_SymmEigensystem.
  int N; SizeSquare(Ain,N);
  DenseMatrix<T > A; A = Ain;
  DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
  Hess(A,Q,0);
  int it = QReigensystem<T>(A, evals, evecs, small);
  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
  return it;
 }
 }
 #endif
@@ -1,242 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/Householder.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef HOUSEHOLDER_H
 #define HOUSEHOLDER_H
 #define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
 #define ENTER()  std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
 #define LEAVE()  std::cout << GridLogMessage << "EXIT  "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
 #include <cstdlib>
 #include <string>
 #include <cmath>
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
 #include <fstream>
 #include <complex>
 #include <algorithm>
 namespace Grid {
 /** Ordering predicate on magnitude: true when |i| is strictly smaller than |j|.
     Passed to max_element to pick the entry of largest absolute value. **/
 template <class T> bool cf(T i, T j) {
  return abs(j) > abs(i);
 }
 /** 
 	Calculate a real Givens angle 
 	Given the pair (y,z), compute c and s with c*c + s*s = 1 from the ratio of the two,
 	always dividing by the entry of larger magnitude.
 	If z is zero the identity rotation (c = 1, s = 0) is returned immediately; without
 	the early return the case y = z = 0 would go on to evaluate 0/0.
 **/
 template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
  RealD mz = (RealD)abs(z);
  if(mz==0.0){
    c = 1; s = 0;
    return;
  }
  if(mz >= (RealD)abs(y)){
    // |z| >= |y|: work with t = -y/z, whose magnitude is at most 1
    T t = -y/z;
    s = (T)1.0 / sqrt ((T)1.0 + t * t);
    c = s * t;
  } else {
    // |y| > |z|: work with t = -z/y
    T t = -z/y;
    c = (T)1.0 / sqrt ((T)1.0 + t * t);
    s = c * t;
  }
 }
 /**
 	Apply the plane rotation (c,s) to the square matrix A in place.
 	dir = 0 : rows i and k     ->  row_i' =  c*row_i + s*row_k ,  row_k' = -s*row_i + c*row_k
 	dir = 1 : columns i and k  ->  col_i' =  c*col_i - s*col_k ,  col_k' =  s*col_i + c*col_k
 	Any other value of dir leaves A untouched.
 **/
 template <class T> inline void Givens_mult(DenseMatrix<T> &A,  int i, int k, T c, T s, int dir)
 {
  int dim;
  SizeSquare(A,dim);
  if(dir == 0){
    // rotate the two rows, one column at a time
    for(int col=0; col<dim; col++){
      const T upper = A[i][col];
      const T lower = A[k][col];
      A[i][col] = (c*upper + s*lower);
      A[k][col] = (-s*upper + c*lower);
    }
  } else if(dir == 1){
    // rotate the two columns, one row at a time
    for(int row=0; row<dim; row++){
      const T left  = A[row][i];
      const T right = A[row][k];
      A[row][i] = (c*left - s*right);
      A[row][k] = (s*left + c*right);
    }
  }
 }
 /**
 	from input = x;
 	Compute the complex Householder vector, v, such that
 	P = (I - b v transpose(v) )
 	b = 2/v.v
 	P | x |    | x | k = 0
 	| x |    | 0 | 
 	| x | =  | 0 |
 	| x |    | 0 | j = 3
 	| x |	   | x |
 	These are the "Unreduced" Householder vectors.
 	Only elements k..j of input are used; v is written at the same indices, so it must
 	already hold at least j+1 elements. The entries are scaled by the element of largest
 	magnitude before the norm is taken.
 	If input[k..j] is entirely zero, v[k..j] and beta are both set to zero so that callers
 	testing abs(beta) never read an unset value.
 **/
 template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
 {
  // element of largest magnitude in input[k..j]
  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
  if(abs(m) > 0.0){
    T alpha = 0;
    for(int i=k; i<j+1; i++){
      v[i] = input[i]/m;
      alpha = alpha + v[i]*conj(v[i]);
    }
    alpha = sqrt(alpha);   // norm of the scaled segment
    beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
    // add alpha along the phase of v[k] so the two terms cannot cancel
    if(abs(v[k]) > 0.0)  v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
    else                 v[k] = -alpha;
  } else{
    for(int i=k; i<j+1; i++){
      v[i] = 0.0;
    } 
    beta = (T)0.0;   // no reflection: Householder_mult skips the update when beta is zero
  }
 }
 /**
 	from input = x;
 	Compute the complex Householder vector, v, such that
 	P = (I - b v transpose(v) )
 	b = 2/v.v
 	Px = alpha*e_dir
 	These are the "Unreduced" Householder vectors.
 	Same as the five-argument overload, except that the pivot element is v[dir]
 	instead of v[k] (presumably k <= dir <= j -- TODO confirm against callers).
 	If input[k..j] is entirely zero, v[k..j] and beta are both set to zero.
 **/
 template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
 {
  // cf is a function template: it has to be named as cf<T> for the comparator type to be deducible
  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
  if(abs(m) > 0.0){
    T alpha = 0;
    for(int i=k; i<j+1; i++){
      v[i] = input[i]/m;
      alpha = alpha + v[i]*conj(v[i]);
    }
    alpha = sqrt(alpha);   // norm of the scaled segment
    beta = (T)1.0/(alpha*(alpha + abs(v[dir]) ));
    // add alpha along the phase of the pivot so the two terms cannot cancel
    if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
    else                  v[dir] = -alpha;
  }else{
    for(int i=k; i<j+1; i++){
      v[i] = 0.0;
    } 
    beta = (T)0.0;   // no reflection to apply
  }
 }
 /**
 	Apply a Householder reflection to the square matrix A in place.
 	trans = 0 : update rows k..j of every column p >= l
 	trans = 1 : update columns k..j of every row p >= l
 	v[0 .. j-k] holds the reflection vector, beta its scale factor.
 	Nothing is changed unless abs(beta) > 0.
 	NOTE(review): both branches use conj(v) in the inner product and in the update,
 	whereas Householder_mult_tri mixes v and conj(v); the two agree for real T --
 	confirm which form is intended for complex T.
 **/
 template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
 {
  int dim;
  SizeSquare(A,dim);
  if(!(abs(beta) > 0.0)) return;
  const int len = j-k+1;   // number of rows/columns touched
  if(trans==0){
    for(int col=l; col<dim; col++){
      T acc = 0;
      for(int r=0; r<len; r++){ acc += conj(v[r])*A[k+r][col]; }
      acc *= beta;
      for(int r=0; r<len; r++){ A[k+r][col] = A[k+r][col]-acc*conj(v[r]); }
    }
  } else {
    for(int row=l; row<dim; row++){
      T acc = 0;
      for(int r=0; r<len; r++){ acc += conj(v[r])*A[row][k+r]; }
      acc *= beta;
      for(int r=0; r<len; r++){ A[row][k+r] = A[row][k+r]-acc*conj(v[r]); }
    }
  }
 }
 /**
 	Compute the product PA if trans = 0
 	AP if trans = 1
 	P = (I - b v transpose(v) )
 	b = 2/v.v
 	start at element l of matrix A
 	v is of length j - k + 1 of v are nonzero
 	A is tridiagonal
 	Only columns (trans = 0) or rows (trans = 1) p with l <= p < M are visited, and only
 	entries k..j of each are updated. Nothing is changed unless abs(beta) > 0.
 	Each column/row is updated directly: its correction depends on that column/row alone,
 	so no scratch matrix is needed.
 **/
 template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
 {
  if(abs(beta) > 0.0){
    int N ; SizeSquare(A,N);   // kept for its squareness assertion
    for(int p=l; p<M; p++){
      T s = 0;
      if(trans==0){
        // s = beta * (v^H . column p), then column p -= s * v
        for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
        s = beta*s;
        for(int i=k;i<j+1;i++) A[i][p] = A[i][p] - s*v[i-k];
      }else{
        // s = beta * (row p . v), then row p -= s * v^H
        for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
        s = beta*s;
        for(int i=k;i<j+1;i++) A[p][i] = A[p][i] - s*conj(v[i-k]);
      }
    }
  }
 }
 }
 #endif
@@ -33,6 +33,8 @@ directory
 namespace Grid {
 enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient. The block direction is chosen by blockDim (constructor argument _Orthog)
 //////////////////////////////////////////////////////////////////////////
@@ -40,25 +42,280 @@ template <class Field>
 class BlockConjugateGradient : public OperatorFunction<Field> {
 public:
  typedef typename Field::scalar_type scomplex;
-  const int blockDim = 0;
+  int blockDim ;
  int Nblock;
  BlockCGtype CGtype;
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
-  BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
+  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol),
+    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
-    MaxIterations(maxit),
+  {};
    ErrorOnNoConverge(err_on_no_conv){};
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation of the block of vectors R, built from a Cholesky decomposition.
 //
 //   m_rr : output, slice inner-product matrix R^dag R taken along blockDim
 //   C    : output, L^dag where m_rr = L L^dag
 //   Cinv : output, inverse of C
 //   Q    : output, R C^{-1}
 //   R    : input block
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 void ThinQRfact (Eigen::MatrixXcd &m_rr,
                  Eigen::MatrixXcd &C,
                  Eigen::MatrixXcd &Cinv,
                  Field & Q,
                  const Field & R)
 {
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Dimensions
  //   R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock}
  //
  // Derivation
  //   R^dag R = m_rr is Hermitian, so m_rr = L L^dag   (Cholesky, Eigen LLT)
  //   Q C = R  =>  Q = R C^{-1}
  //   Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1}
  //   Choosing C = L^dag makes Q^dag Q the Nblock x Nblock identity.
  //
  // Checks reported by the original author: C^dag C = R^dag R passes; Q^dag Q = 1 passes.
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceInnerProductMatrix(m_rr,R,R,blockDim);
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Cholesky factor from Eigen (Eigen also offers an LDLT variant documented as more stable)
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  Eigen::MatrixXcd cholL = m_rr.llt().matrixL();
  C    = cholL.adjoint();
  Cinv = C.inverse();
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Q = R C^{-1}, i.e. Q_j = R_i Cinv(i,j).
  // Per the original note, the slice matrix routines already use right multiplication X[j] a[j,i].
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceMulMatrix(Q,Cinv,R,blockDim);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Call one of several implementations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
-  int Orthog = 0; // First dimension is block dim
+  if ( CGtype == BlockCGrQ ) {
    BlockCGrQsolve(Linop,Src,Psi);
  } else if (CGtype == BlockCG ) {
    BlockCGsolve(Linop,Src,Psi);
  } else if (CGtype == CGmultiRHS ) {
    CGmultiRHSsolve(Linop,Src,Psi);
  } else {
    assert(0);
  }
 }
 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQ implementation:
 //--------------------------
 // X is guess/Solution
 // B is RHS
 // Solve A X_i = B_i    ;        i refers to Nblock index
 //
 // Linop supplies A through HermOp. The block index runs along dimension blockDim.
 // On convergence IterationsToComplete holds the iteration count and the function
 // returns; otherwise a message is printed, assert(0) fires if ErrorOnNoConverge is
 // set, and IterationsToComplete is left at MaxIterations+1.
 ////////////////////////////////////////////////////////////////////////////
 void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
 {
  int Orthog = blockDim; // Block direction, taken from the blockDim member
  Nblock = B._grid->_fdimensions[Orthog];
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
  X.checkerboard = B.checkerboard;
  conformable(X, B);
  // Work fields, all copy-constructed from B
  Field tmp(B);
  Field Q(B);
  Field D(B);
  Field Z(B);
  Field AD(B);
  // Nblock x Nblock coefficient matrices; m_tmp1 is not used below
  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
  // Initial residual computation & set up
  std::vector<RealD> residuals(Nblock);
  std::vector<RealD> ssq(Nblock);
  // ssq[b] is the per-slice norm of the source; it normalises the residuals below.
  // NOTE(review): a slice of B with zero norm would make rr = .../ssq[b] a division by zero.
  sliceNorm(ssq,B,Orthog);
  RealD sssum=0;
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  // Sanity checks: neither the source nor the initial guess may contain NaN
  sliceNorm(residuals,B,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  sliceNorm(residuals,X,Orthog);
  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
  /************************************************************************
   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
   ************************************************************************
   * Dimensions:
   *
   *   X,B==(Nferm x Nblock)
   *   A==(Nferm x Nferm)
   *  
   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
   * 
   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
   * for k: 
   *   Z  = AD
   *   M  = [D^dag Z]^{-1}
   *   X  = X + D MC
   *   QS = Q - ZM
   *   D  = Q + D S^dag
   *   C  = S C
   */
  ///////////////////////////////////////
  // Initial block: initial search dir is guess
  ///////////////////////////////////////
  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
  Linop.HermOp(X, AD);
  tmp = B - AD;  
  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
  D=Q;
  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
  ///////////////////////////////////////
  // Timers
  ///////////////////////////////////////
  GridStopWatch sliceInnerTimer;
  GridStopWatch sliceMaddTimer;
  GridStopWatch QRTimer;
  GridStopWatch MatrixTimer;
  GridStopWatch SolverTimer;
  SolverTimer.Start();
  int k;
  for (k = 1; k <= MaxIterations; k++) {
    //3. Z  = AD
    MatrixTimer.Start();
    Linop.HermOp(D, Z);      
    MatrixTimer.Stop();
    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
    //5. X  = X + D MC
    m_tmp     = m_M * m_C;
    sliceMaddTimer.Start();
    sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
    sliceMaddTimer.Stop();
    //6. QS = Q - ZM
    // tmp receives Q - ZM, which ThinQRfact then factorises into the new Q and S
    sliceMaddTimer.Start();
    sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
    sliceMaddTimer.Stop();
    QRTimer.Start();
    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
    QRTimer.Stop();
    //7. D  = Q + D S^dag
    m_tmp = m_S.adjoint();
    sliceMaddTimer.Start();
    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
    sliceMaddTimer.Stop();
    //8. C  = S C
    m_C = m_S*m_C;
    /*********************
     * convergence monitor
     *********************
     */
    // The residual estimate is taken from C alone (m_rr = C^dag C); A is not applied again here.
    // Convergence needs every block's rr/ssq below Tolerance^2.
    m_rr = m_C.adjoint() * m_C;
    RealD max_resid=0;
    RealD rrsum=0;
    RealD rr;
    for(int b=0;b<Nblock;b++) {
      rrsum+=real(m_rr(b,b));
      rr = real(m_rr(b,b))/ssq[b];
      if ( rr > max_resid ) max_resid = rr;
    }
    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
 	      <<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
    if ( max_resid < Tolerance*Tolerance ) { 
      SolverTimer.Stop();
      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
 	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
 		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
      // True residual: recomputed from scratch as |A X - B| / |B|
      Linop.HermOp(X, AD);
      AD = AD-B;
      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
      IterationsToComplete = k;
      return;
    }
  }
  // Reached only when the loop ran out of iterations; k is MaxIterations+1 here
  // and SolverTimer is still running.
  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient; original O'Leary algorithm. The block direction is given by blockDim
 //////////////////////////////////////////////////////////////////////////
 void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
  int Orthog = blockDim; // First dimension is block dim; this is an assumption
  Nblock = Src._grid->_fdimensions[Orthog];
  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
@@ -162,8 +419,9 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
     *********************
     */
    RealD max_resid=0;
    RealD rr;
    for(int b=0;b<Nblock;b++){
-      RealD rr = real(m_rr(b,b))/ssq[b];
+      rr = real(m_rr(b,b))/ssq[b];
      if ( rr > max_resid ) max_resid = rr;
    }
@@ -173,13 +431,14 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
      std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
 		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
      Linop.HermOp(Psi, AP);
      AP = AP-Src;
-      std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
@@ -197,35 +456,13 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
 };
 //////////////////////////////////////////////////////////////////////////
 // multiRHS conjugate gradient. Dimension zero should be the block direction
 // Use this for spread out across nodes
 //////////////////////////////////////////////////////////////////////////
-template <class Field>
+void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 class MultiRHSConjugateGradient : public OperatorFunction<Field> {
 public:
  typedef typename Field::scalar_type scomplex;
  const int blockDim = 0;
  int Nblock;
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : Tolerance(tol),
    MaxIterations(maxit),
    ErrorOnNoConverge(err_on_no_conv){};
 void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
 {
-  int Orthog = 0; // First dimension is block dim
+  int Orthog = blockDim; // First dimension is block dim
  Nblock = Src._grid->_fdimensions[Orthog];
  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
@@ -285,12 +522,10 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
    MatrixTimer.Stop();
    // Alpha
    //    sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog);
    sliceInnerTimer.Start();
    sliceInnerProductVector(v_pAp,P,AP,Orthog);
    sliceInnerTimer.Stop();
    for(int b=0;b<Nblock;b++){
      //      std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl;
      v_alpha[b] = v_rr[b]/real(v_pAp[b]);
    }
@@ -332,7 +567,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
      std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
      for(int b=0;b<Nblock;b++){
-	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
+	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
      }
      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
@@ -358,9 +593,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
 }
 };
 }
 #endif
@@ -1,81 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/EigenSort.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_EIGENSORT_H
 #define GRID_EIGENSORT_H
 namespace Grid {
    /////////////////////////////////////////////////////////////
    // Eigen sorter to begin with
    /////////////////////////////////////////////////////////////
 template<class Field>
 class SortEigen {
 private:
 //hacking for testing for now
 private:
  static bool less_lmd(RealD left,RealD right){
    return left > right;
  }  
  static bool less_pair(std::pair<RealD,Field const*>& left,
                        std::pair<RealD,Field const*>& right){
    return left.first > (right.first);
  }  
 public:
  void push(DenseVector<RealD>& lmd,
            DenseVector<Field>& evec,int N) {
    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());    
    for(int i=0;i<lmd.size();++i)
      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
    for(int i=0;i<N;++i){
      lmd[i]=it->first;
      evec[i]=*(it->second);
      ++it;
    }
  }
  void push(DenseVector<RealD>& lmd,int N) {
    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
  }
  bool saturated(RealD lmd, RealD thrs) {
    return fabs(lmd) > fabs(thrs);
  }
 };
 }
 #endif
@@ -11,7 +11,7 @@ int PointerCache::victim;
 void *PointerCache::Insert(void *ptr,size_t bytes) {
-  if (bytes < 4096 ) return NULL;
+  if (bytes < 4096 ) return ptr;
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
@@ -92,18 +92,34 @@ public:
    size_type bytes = __n*sizeof(_Tp);
    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
    //    if ( ptr != NULL ) 
    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
    //////////////////
    // Hack 2MB align; could make option probably doesn't need configurability
    //////////////////
 //define GRID_ALLOC_ALIGN (128)
 #define GRID_ALLOC_ALIGN (2*1024*1024)
 #ifdef HAVE_MM_MALLOC_H
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
 #else
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
-
+    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
    // First touch optimise in threaded loop
    uint8_t *cp = (uint8_t *)ptr;
 #ifdef GRID_OMP
 #pragma omp parallel for
 #endif
    for(size_type n=0;n<bytes;n+=4096){
      cp[n]=0;
    }
    return ptr;
  }
  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n * sizeof(_Tp);
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
 #ifdef HAVE_MM_MALLOC_H
@@ -182,10 +198,19 @@ public:
  pointer allocate(size_type __n, const void* _p= 0) 
  {
 #ifdef HAVE_MM_MALLOC_H
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
 #else
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
 #endif
    size_type bytes = __n*sizeof(_Tp);
    uint8_t *cp = (uint8_t *)ptr;
    if ( ptr ) { 
    // One touch per 4k page, static OMP loop to catch same loop order
 #pragma omp parallel for schedule(static)
      for(size_type n=0;n<bytes;n+=4096){
 	cp[n]=0;
      }
    }
    return ptr;
  }
  void deallocate(pointer __p, size_type) { 
@@ -185,17 +185,18 @@ public:
    ////////////////////////////////////////////////////////////////
    void show_decomposition(){
-      std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl;
+      std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
-      std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl;
+      std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
-      std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl;
+      std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
-      std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
+      std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
-      std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl;
+      std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
-      std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl;
+      std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
-      std::cout << GridLogMessage << "iSites             : " << _isites << std::endl;
+      std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
-      std::cout << GridLogMessage << "oSites             : " << _osites << std::endl;
+      std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
-      std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;        
+      std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
-      std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl;
+      std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;        
-      std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;             
+      std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;
      std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;             
    } 
    ////////////////////////////////////////////////////////////////
@@ -62,77 +62,81 @@ public:
      return shift;
    }
    GridCartesian(const std::vector<int> &dimensions,
-		  const std::vector<int> &simd_layout,
+                  const std::vector<int> &simd_layout,
-		  const std::vector<int> &processor_grid
+                  const std::vector<int> &processor_grid) : GridBase(processor_grid)
 		  ) : GridBase(processor_grid)
    {
-        ///////////////////////
+      ///////////////////////
-        // Grid information
+      // Grid information
-        ///////////////////////
+      ///////////////////////
-        _ndimension = dimensions.size();
+      _ndimension = dimensions.size();
-        _fdimensions.resize(_ndimension);
+      _fdimensions.resize(_ndimension);
-        _gdimensions.resize(_ndimension);
+      _gdimensions.resize(_ndimension);
-        _ldimensions.resize(_ndimension);
+      _ldimensions.resize(_ndimension);
-        _rdimensions.resize(_ndimension);
+      _rdimensions.resize(_ndimension);
-        _simd_layout.resize(_ndimension);
+      _simd_layout.resize(_ndimension);
-	_lstart.resize(_ndimension);
+      _lstart.resize(_ndimension);
-	_lend.resize(_ndimension);
+      _lend.resize(_ndimension);
-        _ostride.resize(_ndimension);
+      _ostride.resize(_ndimension);
-        _istride.resize(_ndimension);
+      _istride.resize(_ndimension);
-        _fsites = _gsites = _osites = _isites = 1;
+      _fsites = _gsites = _osites = _isites = 1;
-        for(int d=0;d<_ndimension;d++){
+      for (int d = 0; d < _ndimension; d++)
-	  _fdimensions[d] = dimensions[d]; // Global dimensions
+      {
-	  _gdimensions[d] = _fdimensions[d]; // Global dimensions
+        _fdimensions[d] = dimensions[d];   // Global dimensions
-	  _simd_layout[d] = simd_layout[d];
+        _gdimensions[d] = _fdimensions[d]; // Global dimensions
-	  _fsites = _fsites * _fdimensions[d];
+        _simd_layout[d] = simd_layout[d];
-	  _gsites = _gsites * _gdimensions[d];
+        _fsites = _fsites * _fdimensions[d];
        _gsites = _gsites * _gdimensions[d];
-	  //FIXME check for exact division
+        // Use a reduced simd grid
        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
-	  // Use a reduced simd grid
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
-	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
 	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
 	  _lstart[d]     = _processor_coor[d]*_ldimensions[d];
 	  _lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
 	  _osites  *= _rdimensions[d];
 	  _isites  *= _simd_layout[d];
-	  // Addressing support
+        _lstart[d] = _processor_coor[d] * _ldimensions[d];
-	  if ( d==0 ) {
+        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
-	    _ostride[d] = 1;
+        _osites *= _rdimensions[d];
-	    _istride[d] = 1;
+        _isites *= _simd_layout[d];
-	  } else {
+
-	    _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
+        // Addressing support
-	    _istride[d] = _istride[d-1]*_simd_layout[d-1];
+        if (d == 0)
-	  }
+        {
          _ostride[d] = 1;
          _istride[d] = 1;
        }
-        
+        else
-        ///////////////////////
+        {
-        // subplane information
+          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
-        ///////////////////////
+          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        _slice_block.resize(_ndimension);
        _slice_stride.resize(_ndimension);
        _slice_nblock.resize(_ndimension);
        int block =1;
        int nblock=1;
        for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
        for(int d=0;d<_ndimension;d++){
            nblock/=_rdimensions[d];
            _slice_block[d] =block;
            _slice_stride[d]=_ostride[d]*_rdimensions[d];
            _slice_nblock[d]=nblock;
            block = block*_rdimensions[d];
        }
      }
      ///////////////////////
      // subplane information
      ///////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);
      int block = 1;
      int nblock = 1;
      for (int d = 0; d < _ndimension; d++)
        nblock *= _rdimensions[d];
      for (int d = 0; d < _ndimension; d++)
      {
        nblock /= _rdimensions[d];
        _slice_block[d] = block;
        _slice_stride[d] = _ostride[d] * _rdimensions[d];
        _slice_nblock[d] = nblock;
        block = block * _rdimensions[d];
      }
    };
 };
 }
 #endif
@@ -131,20 +131,20 @@ public:
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
    }
    void Init(const std::vector<int> &dimensions,
-	      const std::vector<int> &simd_layout,
+              const std::vector<int> &simd_layout,
-	      const std::vector<int> &processor_grid,
+              const std::vector<int> &processor_grid,
-	      const std::vector<int> &checker_dim_mask,
+              const std::vector<int> &checker_dim_mask,
-	      int checker_dim)
+              int checker_dim)
    {
-    ///////////////////////
+      ///////////////////////
-    // Grid information
+      // Grid information
-    ///////////////////////
+      ///////////////////////
      _checker_dim = checker_dim;
-      assert(checker_dim_mask[checker_dim]==1);
+      assert(checker_dim_mask[checker_dim] == 1);
      _ndimension = dimensions.size();
-      assert(checker_dim_mask.size()==_ndimension);
+      assert(checker_dim_mask.size() == _ndimension);
-      assert(processor_grid.size()==_ndimension);
+      assert(processor_grid.size() == _ndimension);
-      assert(simd_layout.size()==_ndimension);
+      assert(simd_layout.size() == _ndimension);
      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
@@ -159,47 +159,55 @@ public:
      _fsites = _gsites = _osites = _isites = 1;
-      _checker_dim_mask=checker_dim_mask;
+      _checker_dim_mask = checker_dim_mask;
-      for(int d=0;d<_ndimension;d++){
+      for (int d = 0; d < _ndimension; d++)
-	_fdimensions[d] = dimensions[d];
+      {
-	_gdimensions[d] = _fdimensions[d];
+        _fdimensions[d] = dimensions[d];
-	_fsites = _fsites * _fdimensions[d];
+        _gdimensions[d] = _fdimensions[d];
-	_gsites = _gsites * _gdimensions[d];
+        _fsites = _fsites * _fdimensions[d];
        _gsites = _gsites * _gdimensions[d];
-	if (d==_checker_dim) {
+        if (d == _checker_dim)
-	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
+        {
-	}
+          assert((_gdimensions[d] & 0x1) == 0);
-	_ldimensions[d] = _gdimensions[d]/_processors[d];
+          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
-	_lstart[d]     = _processor_coor[d]*_ldimensions[d];
+        }
-	_lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
+        _ldimensions[d] = _gdimensions[d] / _processors[d];
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
        _lstart[d] = _processor_coor[d] * _ldimensions[d];
        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
-	// Use a reduced simd grid
+        // Use a reduced simd grid
-	_simd_layout[d] = simd_layout[d];
+        _simd_layout[d] = simd_layout[d];
-	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
-	assert(_rdimensions[d]>0);
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
        assert(_rdimensions[d] > 0);
-	// all elements of a simd vector must have same checkerboard.
+        // all elements of a simd vector must have same checkerboard.
-	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
+        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
-	if ( _simd_layout[d]>1 ) {
+        if (_simd_layout[d] > 1)
-	  if ( checker_dim_mask[d] ) { 
+        {
-	    assert( (_rdimensions[d]&0x1) == 0 );
+          if (checker_dim_mask[d])
-	  }
+          {
-	}
+            assert((_rdimensions[d] & 0x1) == 0);
-
+          }
-	_osites *= _rdimensions[d];
+        }
 	_isites *= _simd_layout[d];
 	// Addressing support
 	if ( d==0 ) {
 	  _ostride[d] = 1;
 	  _istride[d] = 1;
 	} else {
 	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
 	  _istride[d] = _istride[d-1]*_simd_layout[d-1];
 	}
        _osites *= _rdimensions[d];
        _isites *= _simd_layout[d];
        // Addressing support
        if (d == 0)
        {
          _ostride[d] = 1;
          _istride[d] = 1;
        }
        else
        {
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        }
      }
      ////////////////////////////////////////////////////////////////////////////////////////////
@@ -209,58 +217,69 @@ public:
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);
-      int block =1;
+      int block = 1;
-      int nblock=1;
+      int nblock = 1;
-      for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
+      for (int d = 0; d < _ndimension; d++)
        nblock *= _rdimensions[d];
-      for(int d=0;d<_ndimension;d++){
+      for (int d = 0; d < _ndimension; d++)
-	nblock/=_rdimensions[d];
+      {
-	_slice_block[d] =block;
+        nblock /= _rdimensions[d];
-	_slice_stride[d]=_ostride[d]*_rdimensions[d];
+        _slice_block[d] = block;
-	_slice_nblock[d]=nblock;
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
-	block = block*_rdimensions[d];
+        _slice_nblock[d] = nblock;
        block = block * _rdimensions[d];
      }
      ////////////////////////////////////////////////
      // Create a checkerboard lookup table
      ////////////////////////////////////////////////
      int rvol = 1;
-      for(int d=0;d<_ndimension;d++){
+      for (int d = 0; d < _ndimension; d++)
-	rvol=rvol * _rdimensions[d];
+      {
        rvol = rvol * _rdimensions[d];
      }
      _checker_board.resize(rvol);
-      for(int osite=0;osite<_osites;osite++){
+      for (int osite = 0; osite < _osites; osite++)
-	_checker_board[osite] = CheckerBoardFromOindex (osite);
+      {
        _checker_board[osite] = CheckerBoardFromOindex(osite);
      }
    };
-protected:
+
  protected:
    virtual int oIndex(std::vector<int> &coor)
    {
-      int idx=0;
+      int idx = 0;
-      for(int d=0;d<_ndimension;d++) {
+      for (int d = 0; d < _ndimension; d++)
-	if( d==_checker_dim ) {
+      {
-	  idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
+        if (d == _checker_dim)
-	} else {
+        {
-	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
+          idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
-	}
+        }
        else
        {
          idx += _ostride[d] * (coor[d] % _rdimensions[d]);
        }
      }
      return idx;
    };
    virtual int iIndex(std::vector<int> &lcoor)
    {
-        int idx=0;
+      int idx = 0;
-        for(int d=0;d<_ndimension;d++) {
+      for (int d = 0; d < _ndimension; d++)
-	  if( d==_checker_dim ) {
+      {
-	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
+        if (d == _checker_dim)
-	  } else { 
+        {
-	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
+          idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
-	  }
+        }
-	}
+        else
-        return idx;
+        {
          idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
        }
      }
      return idx;
    }
 };
 }
 #endif
@@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <limits.h>
 #include <sys/mman.h>
 namespace Grid {
@@ -33,8 +37,11 @@ namespace Grid {
 // Info that is set up once and independent of the cartesian layout
 ///////////////////////////////////////////////////////////////
 void *              CartesianCommunicator::ShmCommBuf;
-uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
+uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; 
-CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+CartesianCommunicator::CommunicatorPolicy_t  
 CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 int CartesianCommunicator::nCommThreads = -1;
 int CartesianCommunicator::Hugepages = 0;
 /////////////////////////////////
 // Alloc, free shmem region
@@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
  GlobalSumVector((double *)c,2*N);
 }
-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
+#if !defined( GRID_COMMS_MPI3) 
 int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
 int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
-
+#endif
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
-						       void *xmit,
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						       int xmit_to_rank,
+						     int xmit_to_rank,
-						       void *recv,
+						     void *recv,
-						       int recv_from_rank,
+						     int recv_from_rank,
-						       int bytes)
+						     int bytes, int dir)
 {
  std::vector<CommsRequest_t> list;
  // Discard the "dir"
  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  SendToRecvFromComplete(list);
  return 2.0*bytes;
 }
 // Starts a stencil send/receive by forwarding to SendToRecvFromBegin.
 //   list            request list passed through to SendToRecvFromBegin
 //   xmit / recv     send and receive buffers
 //   xmit_to_rank    rank the data is sent to
 //   recv_from_rank  rank the data is received from
 //   bytes           size of the transfer in bytes
 //   dir             accepted for interface compatibility; unused here
 // Returns 2.0*bytes (presumably counting both the send and the receive
 // traffic -- confirm against callers).
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit, int xmit_to_rank,
                                                          void *recv, int recv_from_rank,
                                                          int bytes, int dir)
 {
  // "dir" plays no part in this implementation
  SendToRecvFromBegin(list, xmit, xmit_to_rank, recv, recv_from_rank, bytes);
  const double traffic = 2.0*bytes;
  return traffic;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 #endif
 #if !defined( GRID_COMMS_MPI3) 
 void CartesianCommunicator::StencilBarrier(void){};
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
@@ -121,8 +146,25 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
  return NULL;
 }
 // Allocates and zeroes the communication buffer ShmCommBuf, sized
 // MAX_MPI_SHM_BYTES.
 //
 // Active path (#if 1): an anonymous MAP_SHARED mmap region. When the
 // Hugepages flag is set and the platform defines MAP_HUGETLB, explicit huge
 // pages are requested; otherwise, where MADV_HUGEPAGE exists, the kernel is
 // only advised to use huge pages. A failed mmap prints the error and
 // terminates the process.
 //
 // Disabled path (#else): backs the buffer with ShmBufStorageVector instead.
 void CartesianCommunicator::ShmInitGeneric(void){
 #if 1
  int map_flags = MAP_SHARED | MAP_ANONYMOUS;
 #ifdef MAP_HUGETLB
  if ( Hugepages ) {
    map_flags |= MAP_HUGETLB;
  }
 #endif
  ShmCommBuf = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, map_flags, -1, 0);
  if ( (void *)MAP_FAILED == ShmCommBuf ) {
    perror("mmap failed ");
    exit(EXIT_FAILURE);
  }
 #ifdef MADV_HUGEPAGE
  // Explicit huge pages were not requested: fall back to a hint.
  if ( Hugepages == 0 ) {
    madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
  }
 #endif
 #else
  ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
  ShmCommBuf = (void *)&ShmBufStorageVector[0];
 #endif
  // Clear the whole buffer before first use.
  bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
 }
 #endif
@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -50,12 +50,24 @@ namespace Grid {
 class CartesianCommunicator {
  public:    
-  // 65536 ranks per node adequate for now
+
  ////////////////////////////////////////////
  // Isend/Irecv/Wait, or Sendrecv blocking
  ////////////////////////////////////////////
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
  ///////////////////////////////////////////
  // Up to 65536 ranks per node adequate for now
  // 128MB shared memory for comms enough for 48^4 local vol comms
  // Give external control (command line override?) of this
-
+  ///////////////////////////////////////////
-  static const int      MAXLOG2RANKSPERNODE = 16;            
+  static const int MAXLOG2RANKSPERNODE = 16;            
-  static uint64_t MAX_MPI_SHM_BYTES;
+  static uint64_t  MAX_MPI_SHM_BYTES;
  static int       nCommThreads;
  // use explicit huge pages
  static int       Hugepages;
  // Communicator should know nothing of the physics grid, only processor grid.
  int              _Nprocessors;     // How many in all
@@ -64,14 +76,18 @@ class CartesianCommunicator {
  std::vector<int> _processor_coor;  // linear processor coordinate
  unsigned long _ndimension;
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  static MPI_Comm communicator_world;
-         MPI_Comm communicator;
+
  MPI_Comm              communicator;
  std::vector<MPI_Comm> communicator_halo;
  typedef MPI_Request CommsRequest_t;
 #else 
  typedef int CommsRequest_t;
 #endif
  ////////////////////////////////////////////////////////////////////
  // Helper functionality for SHM Windows common to all other impls
  ////////////////////////////////////////////////////////////////////
@@ -117,10 +133,6 @@ class CartesianCommunicator {
  /////////////////////////////////
  static void * ShmCommBuf;
  // Isend/Irecv/Wait, or Sendrecv blocking
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
  size_t heap_top;
  size_t heap_bytes;
@@ -211,14 +223,21 @@ class CartesianCommunicator {
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
-  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  double StencilSendToRecvFrom(void *xmit,
-				  void *xmit,
+			       int xmit_to_rank,
-				  int xmit_to_rank,
+			       void *recv,
-				  void *recv,
+			       int recv_from_rank,
-				  int recv_from_rank,
+			       int bytes,int dir);
 				  int bytes);
-  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,
 				    void *recv,
 				    int recv_from_rank,
 				    int bytes,int dir);
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
  void StencilBarrier(void);
  ////////////////////////////////////////////////////////////
@@ -37,11 +37,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <sys/ipc.h>
 #include <sys/shm.h>
 #include <sys/mman.h>
-//#include <zlib.h>
+#include <zlib.h>
-#ifndef SHM_HUGETLB
+#ifdef HAVE_NUMAIF_H
-#define SHM_HUGETLB 04000
+#include <numaif.h>
 #endif
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -197,7 +198,46 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
  ShmCommBuf = 0;
  ShmCommBufs.resize(ShmSize);
-#if 1
+  ////////////////////////////////////////////////////////////////////////////////////////////
  // Hugetlbfs and others map filesystems as mappable huge pages
  ////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_MPI3_SHMMMAP
  char shm_name [NAME_MAX];
  for(int r=0;r<ShmSize;r++){
    size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
    sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
    //sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
    //    printf("Opening file %s \n",shm_name);
    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
    if ( fd == -1) { 
      printf("open %s failed\n",shm_name);
      perror("open hugetlbfs");
      exit(0);
    }
    int mmap_flag = MAP_SHARED ;
 #ifdef MAP_POPULATE    
    mmap_flag|=MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
    if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
 #endif
    void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
    if ( ptr == (void *)MAP_FAILED ) {    
      printf("mmap %s failed\n",shm_name);
      perror("failed mmap");      assert(0);    
    }
    assert(((uint64_t)ptr&0x3F)==0);
    ShmCommBufs[r] =ptr;
  }
 #endif
  ////////////////////////////////////////////////////////////////////////////////////////////
  // POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
  // tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
  // the posix shm virtual file system
  ////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_MPI3_SHMOPEN
  char shm_name [NAME_MAX];
  if ( ShmRank == 0 ) {
    for(int r=0;r<ShmSize;r++){
@@ -211,10 +251,38 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
      ftruncate(fd, size);
-      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      int mmap_flag = MAP_SHARED;
-      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
+#ifdef MAP_POPULATE 
      mmap_flag |= MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
      if (Hugepages) mmap_flag |= MAP_HUGETLB;
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
      if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      assert(((uint64_t)ptr&0x3F)==0);
-      ShmCommBufs[r] =ptr;
+
 // Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h
 #if 0
 //#ifdef HAVE_NUMAIF_H
 	int status;
 	int flags=MPOL_MF_MOVE;
 #ifdef KNL
 	int nodes=1; // numa domain == MCDRAM
 	// Find out if in SNC2,SNC4 mode ?
 #else
 	int nodes=r; // numa domain == MPI ID
 #endif
 	unsigned long count=1;
 	for(uint64_t page=0;page<size;page+=4096){
 	  void *pages = (void *) ( page + (uint64_t)ptr );
 	  uint64_t *cow_it = (uint64_t *)pages;	*cow_it = 1;
 	  ierr= move_pages(0,count, &pages,&nodes,&status,flags);
 	  if (ierr && (page==0)) perror("numa relocate command failed");
 	}
 #endif
 	ShmCommBufs[r] =ptr;
    }
  }
@@ -236,21 +304,32 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
      ShmCommBufs[r] =ptr;
    }
  }
-
+#endif
-#else
+  ////////////////////////////////////////////////////////////////////////////////////////////
  // SHMGET SHMAT and SHM_HUGETLB flag
  ////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_MPI3_SHMGET
  std::vector<int> shmids(ShmSize);
  if ( ShmRank == 0 ) {
    for(int r=0;r<ShmSize;r++){
      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
-      key_t key   = 0x4545 + r;
+      key_t key   = IPC_PRIVATE;
-      if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
+      int flags = IPC_CREAT | SHM_R | SHM_W;
 #ifdef SHM_HUGETLB
      if (Hugepages) flags|=SHM_HUGETLB;
 #endif
      if ((shmids[r]= shmget(key,size, flags)) ==-1) {
 	int errsv = errno;
 	printf("Errno %d\n",errsv);
 	printf("key   %d\n",key);
 	printf("size  %lld\n",size);
 	printf("flags %d\n",flags);
 	perror("shmget");
 	exit(1);
      } else { 
 	printf("shmid: 0x%x\n", shmids[r]);
      }
      printf("shmid: 0x%x\n", shmids[r]);
    }
  }
  MPI_Barrier(ShmComm);
@@ -375,8 +454,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 { 
  int ierr;
  communicator=communicator_world;
  _ndimension = processors.size();
  communicator_halo.resize (2*_ndimension);
  for(int i=0;i<_ndimension*2;i++){
    MPI_Comm_dup(communicator,&communicator_halo[i]);
  }
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
@@ -599,13 +684,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  }
 }
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						       void *xmit,
+						     int dest,
-						       int dest,
+						     void *recv,
-						       void *recv,
+						     int from,
-						       int from,
+						     int bytes,int dir)
 						       int bytes)
 {
  std::vector<CommsRequest_t> list;
  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,
 							 void *recv,
 							 int from,
 							 int bytes,int dir)
 {
  assert(dir < communicator_halo.size());
  MPI_Request xrq;
  MPI_Request rrq;
@@ -624,26 +723,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  gfrom = MPI_UNDEFINED;
 #endif
  if ( gfrom ==MPI_UNDEFINED) {
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }
  if ( gdest == MPI_UNDEFINED ) {
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
-    this->StencilSendToRecvFromComplete(list);
+    this->StencilSendToRecvFromComplete(list,dir);
  }
  return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
@@ -0,0 +1,286 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/ActionCore.h>
 #include <mpi.h>
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is set up once and is independent of the Cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 MPI_Comm CartesianCommunicator::communicator_world;
 // Should error check all MPI calls.
 // One-time MPI start-up shared by every CartesianCommunicator.
 //   argc, argv : forwarded to MPI_Init_thread when this call is the one that initialises MPI.
 // If MPI is already initialised it is left untouched. When MPI_Init_thread does not grant
 // MPI_THREAD_MULTIPLE the Wilson kernels are switched to the CommsThenCompute setting.
 // Finally MPI_COMM_WORLD is duplicated into communicator_world and ShmInitGeneric() is called.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int already_initialised;
  MPI_Initialized(&already_initialised); // needed to coexist with other libs apparently
  if ( already_initialised == 0 ) {
    int granted;
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&granted);
    const bool full_threading = ( granted == MPI_THREAD_MULTIPLE );
    if ( !full_threading ) {
      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
    }
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  ShmInitGeneric();
 }
 // Builds a periodic Cartesian process grid of shape `processors` on top of communicator_world.
 // Records this rank, its grid coordinates and the total rank count, and duplicates the
 // Cartesian communicator once per entry of communicator_halo (two per dimension).
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _ndimension = processors.size();
  _processors = processors;
  _processor_coor.resize(_ndimension);
  _Nprocessors=1;

  // every dimension is periodic; the literal 1 is MPI_Cart_create's "reorder" argument
  std::vector<int> wrap(_ndimension,1);
  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&wrap[0],1,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

  // total number of ranks requested = product of the per-dimension extents
  for(int d=0;d<_ndimension;d++){
    _Nprocessors*=_processors[d];
  }

  communicator_halo.resize (2*_ndimension);
  for(int h=0;h<_ndimension*2;h++){
    MPI_Comm_dup(communicator,&communicator_halo[h]);
  }

  // the communicator handed back by MPI must hold exactly the requested number of ranks
  int comm_ranks;
  MPI_Comm_size(communicator,&comm_ranks);
  assert(comm_ranks==_Nprocessors);
 }
 // In-place MPI_Allreduce wrappers over `communicator`. Each one overwrites its argument with
 // the value combined across the communicator and asserts that MPI returned 0.

 // Sum of a 32-bit unsigned integer.
 void CartesianCommunicator::GlobalSum(uint32_t &u)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(rc==0);
 }
 // Sum of a 64-bit unsigned integer.
 void CartesianCommunicator::GlobalSum(uint64_t &u)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(rc==0);
 }
 // Bitwise XOR of a 32-bit unsigned integer.
 void CartesianCommunicator::GlobalXOR(uint32_t &u)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
  assert(rc==0);
 }
 // Bitwise XOR of a 64-bit unsigned integer.
 void CartesianCommunicator::GlobalXOR(uint64_t &u)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(rc==0);
 }
 // Sum of a single float.
 void CartesianCommunicator::GlobalSum(float &f)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(rc==0);
 }
 // Element-wise sum of the N floats starting at f.
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(rc==0);
 }
 // Sum of a single double.
 void CartesianCommunicator::GlobalSum(double &d)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(rc==0);
 }
 // Element-wise sum of the N doubles starting at d.
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  const int rc = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(rc==0);
 }
 // Thin wrapper over MPI_Cart_shift on `communicator`: for a displacement `shift` along
 // dimension `dim`, `source` and `dest` receive the two neighbour ranks MPI reports.
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  const int rc = MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(rc==0);
 }
 // Returns the rank in `communicator` that MPI_Cart_rank assigns to the grid coordinate `coor`.
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  int found_rank;
  const int rc = MPI_Cart_rank  (communicator, &coor[0], &found_rank);
  assert(rc==0);
  return found_rank;
 }
 // Fills `coor` (resized to _ndimension entries) with the grid coordinate that
 // MPI_Cart_coords reports for `rank` in `communicator`.
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
  coor.resize(_ndimension);
  const int rc = MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
  assert(rc==0);
 }
 // Basic Halo comms primitive
 // Convenience form of the halo exchange: starts the transfer with SendToRecvFromBegin and
 // immediately finishes it with SendToRecvFromComplete, using a request list local to the call.
 //   xmit/dest : buffer to send and the rank it goes to
 //   recv/from : buffer to fill and the rank it comes from
 //   bytes     : size of each buffer in bytes
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int dest,
                                            void *recv,
                                            int from,
                                            int bytes)
 {
  std::vector<CommsRequest_t> pending;
  SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(pending);
 }
 // Blocking point-to-point transfer of `bytes` bytes from rank `sender` to rank `receiver`
 // on `communicator`. Only the rank equal to `sender` sends (from xmit) and only the rank
 // equal to `receiver` receives (into recv); any other rank does nothing.
 // The message tag is the sender's rank. `sender` and `receiver` must differ.
 // MPI return codes are asserted to be 0, as in the other wrappers of this file.
 void CartesianCommunicator::SendRecvPacket(void *xmit,
                                            void *recv,
                                            int sender,
                                            int receiver,
                                            int bytes)
 {
  assert(sender != receiver);
  int tag = sender;
  int ierr;
  if ( _processor == sender ) {
    ierr = MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
    assert(ierr==0);
  }
  if ( _processor == receiver ) { 
    MPI_Status stat;
    ierr = MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
    assert(ierr==0);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  int myrank = _processor;
  int ierr;
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    MPI_Request xrq;
    MPI_Request rrq;
    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else { 
    // Give the CPU to MPI immediately; can use threads to overlap optionally
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
 		      recv,bytes,MPI_CHAR,from, from,
 		      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    int nreq=list.size();
    std::vector<MPI_Status> status(nreq);
    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
    assert(ierr==0);
  }
 }
 // Calls MPI_Barrier on `communicator` and asserts that it returned 0.
 void CartesianCommunicator::Barrier(void)
 {
  const int rc = MPI_Barrier(communicator);
  assert(rc==0);
 }
 // Broadcasts `bytes` raw bytes starting at `data` from rank `root` over `communicator`.
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
  const int rc = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator);
  assert(rc==0);
 }
  ///////////////////////////////////////////////////////
  // Should only be used prior to Grid Init finished.
  // Check for this?
  ///////////////////////////////////////////////////////
 // Returns the calling process's rank within communicator_world.
 int CartesianCommunicator::RankWorld(void){ 
  int world_rank; 
  MPI_Comm_rank(communicator_world,&world_rank);
  return world_rank;
 }
 // Broadcasts `bytes` raw bytes starting at `data` from rank `root` over communicator_world.
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  const int rc = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator_world);
  assert(rc==0);
 }
 // Posts a non-blocking halo exchange on the per-direction communicator communicator_halo[dir].
 //   list           : receives the two new requests (send request first, then receive request)
 //   xmit           : buffer to send to rank xmit_to_rank, tagged with this rank
 //   recv           : buffer to fill from rank recv_from_rank, tagged with that rank
 //   bytes          : size of each buffer in bytes
 //   dir            : index into communicator_halo; must be a valid index
 // Returns 2.0*bytes. The requests must later be completed, e.g. via StencilSendToRecvFromComplete.
 // MPI return codes are asserted to be 0, as in the other wrappers of this file.
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int xmit_to_rank,
                                                          void *recv,
                                                          int recv_from_rank,
                                                          int bytes,int dir)
 {
  int myrank = _processor;
  int ierr;
  assert(dir < communicator_halo.size());

  // req[0] is the send, req[1] the receive; the receive is posted first
  MPI_Request req[2];
  ierr = MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
  assert(ierr==0);
  ierr = MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
  assert(ierr==0);

  list.push_back(req[0]);
  list.push_back(req[1]);

  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 { 
  int nreq=waitall.size();
  MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
 };
 // Blocking halo exchange on the per-direction communicator communicator_halo[dir]:
 // posts a receive of `bytes` bytes into recv from recv_from_rank and a send of `bytes`
 // bytes from xmit to xmit_to_rank, then waits for both before returning.
 // Outgoing data is tagged with this rank, incoming data with recv_from_rank.
 // `dir` must be a valid index into communicator_halo. Returns 2.0*bytes.
 // MPI return codes are asserted to be 0, as in the other wrappers of this file.
 double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
                                                     int xmit_to_rank,
                                                     void *recv,
                                                     int recv_from_rank,
                                                     int bytes,int dir)
 {
  int myrank = _processor;
  int ierr;
  assert(dir < communicator_halo.size());

  // req[0] is the send, req[1] the receive; the receive is posted first
  MPI_Request req[2];
  ierr = MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
  assert(ierr==0);
  ierr = MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
  assert(ierr==0);
  ierr = MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
  assert(ierr==0);

  return 2.0*bytes;
 }
 }
@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
@@ -1,4 +1,4 @@
- /*************************************************************************************
+/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_reduction.h
    Copyright (C) 2015
@@ -369,71 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  }
 };
 /*
 template<class vobj>
 static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 			     int Orthog,RealD scale=1.0) 
 {    
  // FIXME: Implementation is slow
  // Best base the linear combination by constructing a 
  // set of vectors of size grid->_rdimensions[Orthog].
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X._grid->GlobalDimensions()[Orthog];
  GridBase *FullGrid  = X._grid;
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  Lattice<vobj> Xslice(SliceGrid);
  Lattice<vobj> Rslice(SliceGrid);
  // If we based this on Cshift it would work for spread out
  // but it would be even slower
  for(int i=0;i<Nblock;i++){
    ExtractSlice(Rslice,Y,i,Orthog);
    ExtractSlice(Xslice,X,i,Orthog);
    Rslice = Rslice + Xslice*(scale*a[i]);
    InsertSlice(Rslice,R,i,Orthog);
  }
 };
 template<class vobj>
 static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
  {
    // FIXME: Implementation is slow
    // Look at localInnerProduct implementation,
    // and do inside a site loop with block strided iterators
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
    typedef typename vobj::tensor_reduced scalar;
    typedef typename scalar::scalar_object  scomplex;
    int Nblock = lhs._grid->GlobalDimensions()[Orthog];
    vec.resize(Nblock);
    std::vector<scomplex> sip(Nblock);
    Lattice<scalar> IP(lhs._grid); 
    IP=localInnerProduct(lhs,rhs);
    sliceSum(IP,sip,Orthog);
    for(int ss=0;ss<Nblock;ss++){
      vec[ss] = TensorRemove(sip[ss]);
    }
  }
 */
 //////////////////////////////////////////////////////////////////////////////////////////
 // FIXME: Implementation is slow
 // If we based this on Cshift it would work for spread out
 // but it would be even slower
 //
 // Repeated extract slice is inefficient
 //
 // Best base the linear combination by constructing a 
 // set of vectors of size grid->_rdimensions[Orthog].
 //////////////////////////////////////////////////////////////////////////////////////////
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
  int NN    = BlockSolverGrid->_ndimension;
@@ -452,7 +388,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
  }
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }
-
+*/
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
@@ -464,55 +400,168 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  int Nblock = X._grid->GlobalDimensions()[Orthog];
  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
-  for(int i=0;i<Nblock;i++){
+  assert( FullGrid->_simd_layout[Orthog]==1);
-    ExtractSlice(Rslice,Y,i,Orthog);
+  int nh =  FullGrid->_ndimension;
-    for(int j=0;j<Nblock;j++){
+  //  int nl = SliceGrid->_ndimension;
-      ExtractSlice(Xslice,X,j,Orthog);
+  int nl = nh-1;
-      Rslice = Rslice + Xslice*(scale*aa(j,i));
+
-    }
+  //FIXME package in a convenient iterator
-    InsertSlice(Rslice,R,i,Orthog);
+  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
 #pragma omp parallel 
  {
    std::vector<vobj> s_x(Nblock);
 #pragma omp for collapse(2)
    for(int n=0;n<nblock;n++){
    for(int b=0;b<block;b++){
      int o  = n*stride + b;
      for(int i=0;i<Nblock;i++){
 	s_x[i] = X[o+i*ostride];
      }
      vobj dot;
      for(int i=0;i<Nblock;i++){
 	dot = Y[o+i*ostride];
 	for(int j=0;j<Nblock;j++){
 	  dot = dot + s_x[j]*(scale*aa(j,i));
 	}
 	R[o+i*ostride]=dot;
      }
    }}
  }
 };
 // For every site (n,b) of the planes selected by the _slice_* tables for direction Orthog,
 // computes, for i = 0..Nblock-1,
 //     R[o + i*ostride] = sum_{j=0}^{Nblock-1} X[o + j*ostride] * (scale * aa(j,i))
 // where Nblock is the global lattice extent in direction Orthog and o = n*stride + b.
 // Requires the SIMD layout in direction Orthog to be 1 (asserted).
 // The Nblock values of X for a site are copied into a per-thread buffer before R is written.
 template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  GridBase *FullGrid  = X._grid;
  int Nblock = FullGrid->GlobalDimensions()[Orthog];

  assert( FullGrid->_simd_layout[Orthog]==1);

  // kept from the original implementation; neither value is used below
  int nh =  FullGrid->_ndimension;
  int nl=1;

  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int ostride=FullGrid->_ostride[Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int stride=FullGrid->_slice_stride[Orthog];

 #pragma omp parallel 
  {
    // one scratch line of X per thread
    std::vector<vobj> x_line(Nblock);

 #pragma omp for collapse(2)
    for(int n=0;n<nblock;n++){
    for(int b=0;b<block;b++){
      const int base = n*stride + b;

      for(int k=0;k<Nblock;k++){
        x_line[k] = X[base+k*ostride];
      }

      vobj acc;
      for(int col=0;col<Nblock;col++){
        // the first term initialises the accumulator, the remaining terms are added to it
        acc = x_line[0]*(scale*aa(0,col));
        for(int row=1;row<Nblock;row++){
          acc = acc + x_line[row]*(scale*aa(row,col));
        }
        R[base+col*ostride]=acc;
      }
    }}
  }
 };
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  // FIXME: Implementation is slow
  // Not sure of best solution.. think about it
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid  = lhs._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  int Nblock = FullGrid->GlobalDimensions()[Orthog];
-  Lattice<vobj> Lslice(SliceGrid);
+  //  Lattice<vobj> Lslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  for(int i=0;i<Nblock;i++){
+  assert( FullGrid->_simd_layout[Orthog]==1);
-    ExtractSlice(Lslice,lhs,i,Orthog);
+  int nh =  FullGrid->_ndimension;
-    for(int j=0;j<Nblock;j++){
+  //  int nl = SliceGrid->_ndimension;
-      ExtractSlice(Rslice,rhs,j,Orthog);
+  int nl = nh-1;
-      mat(i,j) = innerProduct(Lslice,Rslice);
+
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
  typedef typename vobj::vector_typeD vector_typeD;
 #pragma omp parallel 
  {
    std::vector<vobj> Left(Nblock);
    std::vector<vobj> Right(Nblock);
    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 #pragma omp for collapse(2)
    for(int n=0;n<nblock;n++){
    for(int b=0;b<block;b++){
      int o  = n*stride + b;
      for(int i=0;i<Nblock;i++){
 	Left [i] = lhs[o+i*ostride];
 	Right[i] = rhs[o+i*ostride];
      }
      for(int i=0;i<Nblock;i++){
      for(int j=0;j<Nblock;j++){
 	auto tmp = innerProduct(Left[i],Right[j]);
 	//	vector_typeD rtmp = TensorRemove(tmp);
 	auto rtmp = TensorRemove(tmp);
 	mat_thread(i,j) += Reduce(rtmp);
      }}
    }}
 #pragma omp critical
    {
      mat += mat_thread;
    }  
  }
-#undef FORCE_DIAG
+
 #ifdef FORCE_DIAG
  for(int i=0;i<Nblock;i++){
-    for(int j=0;j<Nblock;j++){
+  for(int j=0;j<Nblock;j++){
-      if ( i != j ) mat(i,j)=0.0;
+    ComplexD sum = mat(i,j);
-    }
+    FullGrid->GlobalSum(sum);
-  }
+    mat(i,j)=sum;
-#endif
+  }}
  return;
 }
@@ -62,17 +62,12 @@ namespace Grid {
    return ret;
  }
-  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){
+  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
    obj unit(1.0);
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
-      //ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
+      ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
      ret._odata[ss] = unit;
      for(int i=Nexp; i>=1;--i)
 	ret._odata[ss] = unit + ret._odata[ss]*rhs._odata[ss]*(alpha/RealD(i));
    }
    return ret;
@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
  int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -29,7 +29,7 @@
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) 
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
 #define USE_MPI_IO
 #else
 #undef  USE_MPI_IO
@@ -99,34 +99,38 @@ class BinaryIO {
    NerscChecksum(grid,scalardata,nersc_csum);
  }
-  template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
+  template <class fobj>
  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
  {
-    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
+    const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
-
+    uint64_t lsites = grid->lSites();
-    uint64_t lsites              =grid->lSites();
+    if (fbuf.size() == 1)
-    if (fbuf.size()==1) {
+    {
-      lsites=1;
+      lsites = 1;
    }
-#pragma omp parallel
+    #pragma omp parallel
    {
-      uint32_t nersc_csum_thr=0;
+      uint32_t nersc_csum_thr = 0;
-#pragma omp for
+      #pragma omp for
-      for(uint64_t local_site=0;local_site<lsites;local_site++){
+      for (uint64_t local_site = 0; local_site < lsites; local_site++)
-	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
+      {
-	for(uint64_t j=0;j<size32;j++){
+        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
-	  nersc_csum_thr=nersc_csum_thr+site_buf[j];
+        for (uint64_t j = 0; j < size32; j++)
-	}
+        {
          nersc_csum_thr = nersc_csum_thr + site_buf[j];
        }
      }
-#pragma omp critical
+      #pragma omp critical
      {
-	nersc_csum  += nersc_csum_thr;
+        nersc_csum += nersc_csum_thr;
      }
    }
  }
  template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
  {
    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
@@ -363,17 +367,21 @@ class BinaryIO {
 	assert(0);
 #endif
      } else {
-	std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
+        std::cout << GridLogMessage << "C++ read I/O " << file << " : "
-		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
+                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
-	std::ifstream fin;
+        std::ifstream fin;
-	fin.open(file,std::ios::binary|std::ios::in);
+        fin.open(file, std::ios::binary | std::ios::in);
-	if ( control & BINARYIO_MASTER_APPEND )  {
+        if (control & BINARYIO_MASTER_APPEND)
-	  fin.seekg(-sizeof(fobj),fin.end);
+        {
-	} else { 
+          fin.seekg(-sizeof(fobj), fin.end);
-	  fin.seekg(offset+myrank*lsites*sizeof(fobj));
+        }
-	}
+        else
-	fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
+        {
-	fin.close();
+          fin.seekg(offset + myrank * lsites * sizeof(fobj));
        }
        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
        assert(fin.fail() == 0);
        fin.close();
      }
      timer.Stop();
@@ -405,30 +413,78 @@ class BinaryIO {
      timer.Start();
      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
-	std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
+        std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
-	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
+        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
-	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
+        std::cout << GridLogMessage << "Checking for errors" << std::endl;
-	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
+        if (ierr != MPI_SUCCESS)
-	MPI_File_close(&fh);
+        {
-	MPI_Type_free(&fileArray);
+          char error_string[BUFSIZ];
-	MPI_Type_free(&localArray);
+          int length_of_error_string, error_class;
          MPI_Error_class(ierr, &error_class);
          MPI_Error_string(error_class, error_string, &length_of_error_string);
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
          MPI_Error_string(ierr, error_string, &length_of_error_string);
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
        }
        std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
        assert(ierr == 0);
        std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
        assert(ierr == 0);
        MPI_File_close(&fh);
        MPI_Type_free(&fileArray);
        MPI_Type_free(&localArray);
 #else 
 	assert(0);
 #endif
      } else { 
-	std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+        
-	std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
+	std::ofstream fout; 
-		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
+  fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
-	if ( control & BINARYIO_MASTER_APPEND )  {
+  try {
    fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
  } catch (const std::fstream::failure& exc) {
    std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
    std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
    std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
    #ifdef USE_MPI_IO
    MPI_Abort(MPI_COMM_WORLD,1);
    #else
    exit(1);
    #endif
  }
 	std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
 		        << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
  if ( control & BINARYIO_MASTER_APPEND )  {
 	  fout.seekp(0,fout.end);
 	} else {
 	  fout.seekp(offset+myrank*lsites*sizeof(fobj));
 	}
-	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
+  
  try {
  	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
  }
  catch (const std::fstream::failure& exc) {
    std::cout << "Exception in writing file " << file << std::endl;
    std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
    #ifdef USE_MPI_IO
    MPI_Abort(MPI_COMM_WORLD,1);
    #else
    exit(1);
    #endif
  }
 	fout.close();
-      }
+  }
-      timer.Stop();
+  timer.Stop();
-    }
+  }
    std::cout<<GridLogMessage<<"IOobject: ";
    if ( control & BINARYIO_READ) std::cout << " read  ";
@@ -442,11 +498,14 @@ class BinaryIO {
    //////////////////////////////////////////////////////////////////////////////
    // Safety check
    //////////////////////////////////////////////////////////////////////////////
-    grid->Barrier();
+    // if the data size is 1 we do not want to sum over the MPI ranks
-    grid->GlobalSum(nersc_csum);
+    if (iodata.size() != 1){
-    grid->GlobalXOR(scidac_csuma);
+      grid->Barrier();
-    grid->GlobalXOR(scidac_csumb);
+      grid->GlobalSum(nersc_csum);
-    grid->Barrier();
+      grid->GlobalXOR(scidac_csuma);
      grid->GlobalXOR(scidac_csumb);
      grid->Barrier();
    }
  }
  /////////////////////////////////////////////////////////////////////////////
@@ -546,9 +605,9 @@ class BinaryIO {
    int gsites = grid->gSites();
    int lsites = grid->lSites();
-    uint32_t nersc_csum_tmp;
+    uint32_t nersc_csum_tmp   = 0;
-    uint32_t scidac_csuma_tmp;
+    uint32_t scidac_csuma_tmp = 0;
-    uint32_t scidac_csumb_tmp;
+    uint32_t scidac_csumb_tmp = 0;
    GridStopWatch timer;
@@ -598,9 +598,14 @@ class IldgReader : public GridLimeReader {
 	}
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) { 
-	  XmlReader RD(&xmlc[0],"");
+	  std::string xmls(&xmlc[0]);
-	  read(RD,"usqcdInfo",usqcdInfo_);
+	  // is it a USQCD info field
-	  found_usqcdInfo = 1;
+	  if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) { 
 	    std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
 	    XmlReader RD(&xmlc[0],"");
 	    read(RD,"usqcdInfo",usqcdInfo_);
 	    found_usqcdInfo = 1;
 	  }
 	}
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) { 
@@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
    // 4
-#ifdef AVX512
+#ifdef KNL
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
@@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
-    //    assert(fabs(omega[i])>0.0);
+    assert(omega[i]!=Coeff_t(0.0));
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
  }
@@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  for(int i=0;i<Ls;i++){
    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-    //    assert(fabs(bee[i])>0.0);
+    assert(bee[i]!=Coeff_t(0.0));
    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
    beo[i]=as[i]*bs[i];
    ceo[i]=-as[i]*cs[i];
@@ -456,10 +456,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    if ( i < Ls-1 ) {
      assert(bee[i]!=Coeff_t(0.0));
      assert(bee[0]!=Coeff_t(0.0));
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
      leem[i]=mass*cee[Ls-1]/bee[0];
-      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
+      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
      }
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
@@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  { 
    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
-      //      assert(fabs(bee[j])>0.0);
+      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
    }
    dee[Ls-1] += delta_d;
@@ -237,4 +237,11 @@ typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermion
  }}
 ////////////////////
 // Scalar QED actions
 // TODO: this needs to move to another header after rename to Fermion.h
 ////////////////////
 #include <Grid/qcd/action/scalar/Scalar.h>
 #include <Grid/qcd/action/gauge/Photon.h>
 #endif
@@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 {
  Compressor compressor;
  int LLs = in._grid->_rdimensions[0];
  st.HaloExchange(in,compressor);
  DhopTotalTime -= usecond();
  DhopCommTime -= usecond();
  st.HaloExchange(in,compressor);
  DhopCommTime += usecond();
  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  if (dag == DaggerYes) {
    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
@@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
    }
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check
@@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=1;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check
@@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
  DhopCalls+=2;
  conformable(in._grid,FermionGrid()); // verifies full grid
  conformable(in._grid,out._grid);
@@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }
 // Print the Dhop call count, the per-call timings and the derived flop rates
 // accumulated since the counters were last zeroed, followed by the reports of
 // the three stencils.
 //
 // Every average below divides by DhopCalls (and the flop rates by the timers),
 // so that part is skipped when no Dhop call has been recorded: it would only
 // print inf/nan. WilsonFermion5D::Report guards its output the same way.
 // The guarded section contains a GlobalSum, so it relies on all ranks having
 // the same DhopCalls -- presumably true since Dhop is collective; verify.
 //
 // Side effect: DhopComputeTime is replaced by its sum over ranks divided by
 // the number of processors.
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Report(void) 
 {
  std::vector<int> latt = GridDefaultLatt();          
  // 5d volume: Ls times the product of the Nd default lattice extents
  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _FourDimGrid->_Nprocessors;
  RealD NN = _FourDimGrid->NodeCount();
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
              << DhopCalls   << std::endl;
    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
              << DhopTotalTime   / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
              << DhopCommTime    / DhopCalls << " us" << std::endl;
    std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
              << DhopComputeTime / DhopCalls << " us" << std::endl;
    // Average the compute time
    _FourDimGrid->GlobalSum(DhopComputeTime);
    DhopComputeTime/=NP;
    RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
    RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
  }
  // Stencil reports are issued unconditionally, as before.
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
 }
 // Reset the Dhop performance counters of this operator and the counters of
 // its three stencils.
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
 {
  // call count and the three timers, all back to zero
  DhopCalls = DhopTotalTime = DhopCommTime = DhopComputeTime = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
 }
 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
@@ -55,6 +55,16 @@ namespace QCD {
      FermionField _tmp;
      FermionField &tmp(void) { return _tmp; }
      ////////////////////////////////////////
      // Performance monitoring
      ////////////////////////////////////////
      void Report(void);
      void ZeroCounters(void);
      double DhopTotalTime;
      double DhopCalls;
      double DhopCommTime;
      double DhopComputeTime;
      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
      ///////////////////////////////////////////////////////////////
@@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
 template<class vobj,class cobj>
 class WilsonStencil : public CartesianStencil<vobj,cobj> {
 public:
-
+  double timer0;
  double timer1;
  double timer2;
  double timer3;
  double timer4;
  double timer5;
  double timer6;
  uint64_t callsi;
  // Reset the seven internal timers and the internal call counter.
  void ZeroCountersi(void)
  {
    timer0 = timer1 = timer2 = timer3 = timer4 = timer5 = timer6 = 0;
    callsi = 0;
  }
  // Log the internal timers that are non-zero. timer0..timer3 are printed
  // divided by `calls` (the caller-supplied call count, not callsi); timer4 is
  // printed as accumulated. timer5 and timer6 are not reported.
  void Reporti(int calls)
  {
    auto perCall = [calls](const char *label, double accumulated) {
      if ( accumulated ) std::cout << GridLogMessage << label << accumulated/calls << std::endl;
    };
    perCall(" timer0 (HaloGatherOpt) ", timer0);
    perCall(" timer1 (Communicate)   ", timer1);
    perCall(" timer2 (CommsMerge )   ", timer2);
    perCall(" timer3 (commsMergeShm) ", timer3);
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " << timer4 << std::endl;
  }
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
  std::vector<int> same_node;
@@ -252,6 +278,7 @@ public:
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
    same_node(npoints)
  { 
    ZeroCountersi();
    surface_list.resize(0);
  };
@@ -261,7 +288,6 @@ public:
    // Here we know the distance is 1 for WilsonStencil
    for(int point=0;point<this->_npoints;point++){
      same_node[point] = this->SameNode(point);
      //      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
    }
    for(int site = 0 ;site< vol4;site++){
@@ -282,17 +308,28 @@ public:
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    this->HaloExchangeOptGather(source,compress);
-    this->CommunicateBegin(reqs);
+    double t1=usecond();
-    this->CommunicateComplete(reqs);
+    // Asynchronous MPI calls multidirectional, Isend etc...
    //    this->CommunicateBegin(reqs);
    //    this->CommunicateComplete(reqs);
    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
    this->Communicate();
    double t2=usecond(); timer1 += t2-t1;
    this->CommsMerge(compress);
    double t3=usecond(); timer2 += t3-t2;
    this->CommsMergeSHM(compress);
    double t4=usecond(); timer3 += t4-t3;
  }
  // Prepare the stencil, run the optimised halo gather on `source`, and
  // account for it: the gather duration (usecond() difference) is added to
  // timer0 and callsi is incremented. Prepare() is outside the timed region.
  template <class compressor>
  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
  {
    this->Prepare();
    const double gatherBegin = usecond();
    this->HaloGatherOpt(source,compress);
    const double gatherEnd   = usecond();
    timer0 += gatherEnd-gatherBegin;
    ++callsi;
  }
  template <class compressor>
@@ -304,7 +341,9 @@ public:
    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
    this->mpi3synctime_g-=usecond();
    this->_grid->StencilBarrier();
    this->mpi3synctime_g+=usecond();
    assert(source._grid==this->_grid);
    this->halogtime-=usecond();
@@ -323,7 +362,6 @@ public:
    int dag = compress.dag;
    int face_idx=0;
    if ( dag ) { 
      //	std::cout << " Optimised Dagger compress " <<std::endl;
      assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
      assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
      assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
@@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  int vol4;
  vol4=FourDimGrid.oSites();
  Stencil.BuildSurfaceList(LLs,vol4);
  vol4=FourDimRedBlackGrid.oSites();
  StencilEven.BuildSurfaceList(LLs,vol4);
   StencilOdd.BuildSurfaceList(LLs,vol4);
-  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
+   //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-                       <<" " << StencilEven.surface_list.size()<<std::endl;
+   //                       <<" " << StencilEven.surface_list.size()<<std::endl;
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::Report(void)
 {
-    std::vector<int> latt = GridDefaultLatt();          
+  RealD NP     = _FourDimGrid->_Nprocessors;
-    RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NN     = _FourDimGrid->NodeCount();
-    RealD NP = _FourDimGrid->_Nprocessors;
+  RealD volume = Ls;  
-    RealD NN = _FourDimGrid->NodeCount();
+  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
@@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
  }
  if ( DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
  }
 }
 template<class Impl>
@@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
  Stencil.ZeroCountersi();
  StencilEven.ZeroCountersi();
  StencilOdd.ZeroCountersi();
 }
@@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 {
 #ifdef GRID_OMP
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
  Compressor compressor(dag);
@@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  DhopFaceTime-=usecond();
  st.HaloExchangeOptGather(in,compressor);
-  DhopFaceTime+=usecond();
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  std::vector<std::vector<CommsRequest_t> > reqs;
  // Rely on async comms; start comms before merge of local data
  DhopCommTime-=usecond();
  st.CommunicateBegin(reqs);
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+=usecond();
-  // Perhaps use omp task and region
+  double ctime=0;
-#pragma omp parallel 
+  double ptime=0;
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Ugly explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 #pragma omp parallel reduction(max:ctime) reduction(max:ptime)
  { 
    int tid = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
-    int me = omp_get_thread_num();
+    int ncomms = CartesianCommunicator::nCommThreads;
-    int myoff, mywork;
+    if (ncomms == -1) ncomms = 1;
    assert(nthreads > ncomms);
    if (tid >= ncomms) {
      double start = usecond();
      nthreads -= ncomms;
      int ttid = tid - ncomms;
      int n = U._grid->oSites();
      int chunk = n / nthreads;
      int rem = n % nthreads;
      int myblock, myn;
      if (ttid < rem) {
 	myblock = ttid * chunk + ttid;
 	myn = chunk+1;
      } else {
 	myblock = ttid*chunk + rem;
 	myn = chunk;
      }
-    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
+      // do the compute
-    int sF = LLs * myoff;
+      if (dag == DaggerYes) {
-
+	for (int ss = myblock; ss < myblock+myn; ++ss) {
-    if ( me == 0 ) {
+	  int sU = ss;
-      st.CommunicateComplete(reqs);
+	  int sF = LLs * sU;
-      DhopCommTime+=usecond();
+	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
-    } else { 
+	}
-      // Interior links in stencil
+      } else {
-      if ( me==1 ) DhopComputeTime-=usecond();
+	for (int ss = myblock; ss < myblock+myn; ++ss) {
-      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
+	  int sU = ss;
-      else      	    Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
+	  int sF = LLs * sU;
-      if ( me==1 ) DhopComputeTime+=usecond();
+	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
 	}
      }
 	ptime = usecond() - start;
    }
    {
      double start = usecond();
      st.CommunicateThreaded();
      ctime = usecond() - start;
    }
  }
  DhopCommTime += ctime;
  DhopComputeTime+=ptime;
  // First to enter, last to leave timing
  st.CollateThreads();
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
  // Load imbalance alert. Should use dynamic schedule OMP for loop
  // Perhaps create a list of only those sites with face work, and 
  // load balance process the list.
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    int sz=st.surface_list.size();
@@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 #else 
  assert(0);
 #endif
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
@@ -65,8 +65,8 @@ public:
  typedef iImplGaugeField<Simd> SiteField;
  typedef Lattice<SiteComplex> ComplexField;
-  typedef Lattice<SiteLink>  LinkField; 
+  typedef Lattice<SiteLink>    LinkField; 
-  typedef Lattice<SiteField> Field;
+  typedef Lattice<SiteField>   Field;
  // Guido: we can probably separate the types from the HMC functions
  // this will create 2 kind of implementations
@@ -100,12 +100,17 @@ public:
  static inline Field projectForce(Field &P) { return Ta(P); }
  static inline void update_field(Field& P, Field& U, double ep){
-    for (int mu = 0; mu < Nd; mu++) {
+    //static std::chrono::duration<double> diff;
-      auto Umu = PeekIndex<LorentzIndex>(U, mu);
+
-      auto Pmu = PeekIndex<LorentzIndex>(P, mu);
+    //auto start = std::chrono::high_resolution_clock::now();
-      Umu = expMat(Pmu, ep, Nexp) * Umu;
+    parallel_for(int ss=0;ss<P._grid->oSites();ss++){
-      PokeIndex<LorentzIndex>(U, ProjectOnGroup(Umu), mu);
+      for (int mu = 0; mu < Nd; mu++) 
        U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]);
    }
    //auto end = std::chrono::high_resolution_clock::now();
   // diff += end - start;
   // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
  }
  static inline RealD FieldSquareNorm(Field& U){
@@ -0,0 +1,286 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/gauge/Photon.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef QCD_PHOTON_ACTION_H
 #define QCD_PHOTON_ACTION_H
 namespace Grid{
 namespace QCD{
  // Gauge implementation types for a field with one scalar entry per site and
  // direction: a link is a triply nested iScalar, a gauge field is an
  // Nd-component vector of such scalars. SiteComplex and ComplexField are
  // aliases of SiteField and Field respectively.
  template <class S>
  class QedGimpl
  {
  public:
    using Simd = S;
    // per-site layouts, parametrised on the vector type
    template <typename vtype>
    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
    template <typename vtype>
    using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
    // site objects
    using SiteLink    = iImplGaugeLink<Simd>;
    using SiteField   = iImplGaugeField<Simd>;
    using SiteComplex = SiteField;
    // lattice objects
    using LinkField    = Lattice<SiteLink>;
    using Field        = Lattice<SiteField>;
    using ComplexField = Field;
  };
  typedef QedGimpl<vComplex> QedGimplR;
  // Free photon field helper, parametrised on a gauge implementation Gimpl.
  // It provides the lattice photon propagator 1/khat^2 with a selectable
  // zero-mode subtraction scheme, and generation of stochastic photon fields
  // weighted by that propagator.
  template<class Gimpl>
  class Photon
  {
  public:
    INHERIT_GIMPL_TYPES(Gimpl);
    // Gauge choice; stored by the constructor but not read by any member
    // function defined in this header.
    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
    // Zero-mode scheme used by zmSub(): qedTL zeroes the single site at the
    // origin, qedL zeroes every site whose coordinates in all but the last
    // dimension are zero.
    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
  public:
    Photon(Gauge gauge, ZmScheme zmScheme);
    virtual ~Photon(void) = default;
    // Position-space counterpart of MomentumSpacePropagator, implemented
    // with a forward and a backward FFT over all dimensions.
    void FreePropagator(const GaugeField &in, GaugeField &out);
    // out = in * (1/khat^2 after zero-mode subtraction), site by site.
    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
    // weight = sqrt(volume * real(1/khat^2)), then zero-mode subtracted.
    void StochasticWeight(GaugeLinkField &weight);
    // Fill `out` with a stochastic field; this overload computes the weight
    // itself via StochasticWeight().
    void StochasticField(GaugeField &out, GridParallelRNG &rng);
    // Same, with a precomputed weight: per direction draw Gaussian noise,
    // multiply by `weight`, then backward-FFT and keep the real part.
    void StochasticField(GaugeField &out, GridParallelRNG &rng,
                         const GaugeLinkField &weight);
  private:
    // out = 1/sum_mu (2 sin(pi n_mu/L_mu))^2, with the origin site set to 0.
    void invKHatSquared(GaugeLinkField &out);
    // Apply the zero-mode subtraction selected by zmScheme_ to `out`.
    void zmSub(GaugeLinkField &out);
  private:
    Gauge    gauge_;
    ZmScheme zmScheme_;
  };
  typedef Photon<QedGimplR>  PhotonR;
  // Store the requested gauge and zero-mode scheme; nothing else is set up.
  template<class Gimpl>
  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
  : gauge_(gauge), zmScheme_(zmScheme)
  {}
  // Apply the free photon propagator to `in` and write the result to `out`:
  // Fourier transform `in` over all dimensions, multiply by the
  // momentum-space propagator, and transform back.
  //
  // in  : source field (not modified)
  // out : receives the propagated field
  template<class Gimpl>
  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
  {
    // The other FFT users in this header construct it from a GridCartesian*,
    // so cast the base-grid pointer the same way here.
    FFT        theFFT(dynamic_cast<GridCartesian *>(in._grid));
    GaugeField in_k(in._grid);
    GaugeField prop_k(in._grid);
    theFFT.FFT_all_dim(in_k,in,FFT::forward);
    // MomentumSpacePropagator takes (input, output): the transformed source
    // is the input and prop_k receives the product. The previous call passed
    // them the other way round, so the uninitialised prop_k was used as
    // input and then transformed back.
    MomentumSpacePropagator(in_k,prop_k);
    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
  }
  // Fill `out` with the inverse lattice momentum squared,
  //   1 / sum_mu khat_mu^2,   khat_mu = 2 sin(pi n_mu / L_mu),
  // where n_mu is the site coordinate and L_mu the full lattice extent in
  // direction mu. The site at the origin, where the sum vanishes, is set to 0.
  template<class Gimpl>
  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
  {
    GridBase           *grid   = out._grid;
    const unsigned int nd      = grid->_ndimension;
    std::vector<int>   &extent = grid->_fdimensions;
    std::vector<int>   origin(nd,0);
    GaugeLinkField     khat(grid), unity(grid);
    TComplex           siteOne  = Complex(1.0,0.0);
    TComplex           siteZero = Complex(0.0,0.0);
    out   = zero;
    unity = Complex(1.0,0.0);
    // accumulate khat^2 over all directions
    for(int mu = 0; mu < nd; mu++)
    {
      Real twoPiL = M_PI*2./extent[mu];
      LatticeCoordinate(khat,mu);
      khat = 2.*sin(.5*twoPiL*khat);
      out  = out + khat*khat;
    }
    // put 1 at the origin so the inversion does not divide by zero there,
    // then overwrite that site with 0 afterwards
    pokeSite(siteOne, out, origin);
    out = unity/out;
    pokeSite(siteZero, out, origin);
  }
  // Zero-mode subtraction on `out`, according to zmScheme_:
  //  - qedTL : set the single site with all coordinates zero to 0;
  //  - qedL  : set to 0 every site whose coordinates in all dimensions but
  //            the last are zero;
  //  - any other value: leave `out` untouched.
  template<class Gimpl>
  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
  {
    GridBase           *grid = out._grid;
    const unsigned int nd    = grid->_ndimension;
    switch (zmScheme_)
    {
      case ZmScheme::qedTL:
      {
        std::vector<int> zm(nd,0);
        TComplex         Tzero = Complex(0.0,0.0);
        pokeSite(Tzero, out, zm);
        break;
      }
      case ZmScheme::qedL:
      {
        LatticeInteger spNrm(grid), coor(grid);
        GaugeLinkField z(grid);
        z     = zero;
        spNrm = zero;
        // spNrm = sum of squared coordinates over all but the last dimension;
        // it vanishes exactly on the sites to be removed
        for(int d = 0; d < grid->_ndimension - 1; d++)
        {
          LatticeCoordinate(coor,d);
          spNrm = spNrm + coor*coor;
        }
        // Select from the zero lattice rather than from 0.*out: the latter
        // keeps NaN wherever `out` is inf/NaN, and left `z` allocated but
        // unused.
        out = where(spNrm == Integer(0), z, out);
        break;
      }
      default:
        break;
    }
  }
  // Multiply the momentum-space field `in` by the photon propagator and store
  // the product in `out`. The propagator is 1/khat^2 followed by the zero-mode
  // subtraction selected at construction; gauge_ is not consulted.
  template<class Gimpl>
  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
                                               GaugeField &out)
  {
    LatticeComplex invK2(out._grid);   // lives on the grid of `out`
    invKHatSquared(invK2);
    zmSub(invK2);
    out = in*invK2;
  }
  // Compute the momentum-space weight for stochastic field generation:
  //   weight = sqrt( V * real(1/khat^2) ),
  // with V the product of the full lattice extents, followed by the zero-mode
  // subtraction. The grid of `weight` is accessed as a GridCartesian; the
  // result of the dynamic_cast is not checked.
  template<class Gimpl>
  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
  {
    auto               *cartGrid = dynamic_cast<GridCartesian *>(weight._grid);
    const unsigned int nDim      = cartGrid->_ndimension;
    std::vector<int>   extent    = cartGrid->_fdimensions;
    // global volume as an Integer
    Integer volume = 1;
    for(int d = 0; d < nDim; d++) volume *= extent[d];
    invKHatSquared(weight);
    weight = sqrt(volume*real(weight));
    zmSub(weight);
  }
  // Convenience overload: build the stochastic weight on the grid of `out`
  // and forward to the three-argument StochasticField.
  template<class Gimpl>
  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
  {
    GaugeLinkField weight(dynamic_cast<GridCartesian *>(out._grid));
    StochasticWeight(weight);
    StochasticField(out, rng, weight);
  }
  // Generate a stochastic field in `out` from a precomputed momentum-space
  // weight: for each of the grid's dimensions draw Gaussian noise from `rng`,
  // multiply it by `weight` and store it as that Lorentz component; then
  // backward-FFT over all dimensions and keep the real part.
  template<class Gimpl>
  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
                                      const GaugeLinkField &weight)
  {
    auto               *cartGrid = dynamic_cast<GridCartesian *>(out._grid);
    const unsigned int nDim      = cartGrid->_ndimension;
    GaugeLinkField     noise(cartGrid);
    GaugeField         aTilde(cartGrid);
    FFT                fft(cartGrid);
    // one independent weighted noise field per direction
    for(int mu = 0; mu < nDim; mu++)
    {
      gaussian(rng, noise);
      noise = weight*noise;
      pokeLorentz(aTilde, noise, mu);
    }
    fft.FFT_all_dim(out, aTilde, FFT::backward);
    out = real(out);
  }
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
 //                                                            const GaugeField &in)
 //  {
 //    
 //    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
 //    
 //    GridBase *grid = out._grid;
 //    LatticeInteger     coor(grid);
 //    GaugeField zz(grid); zz=zero;
 //    
 //    // xyzt
 //    for(int d = 0; d < grid->_ndimension-1;d++){
 //      LatticeCoordinate(coor,d);
 //      out = where(coor==Integer(0),zz,out);
 //    }
 //  }
 //  
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
 //                                                             const GaugeField &in)
 //  {
 //    
 //    // what type LatticeComplex
 //    GridBase *grid = out._grid;
 //    int nd = grid->_ndimension;
 //    
 //    typedef typename GaugeField::vector_type vector_type;
 //    typedef typename GaugeField::scalar_type ScalComplex;
 //    typedef Lattice<iSinglet<vector_type> > LatComplex;
 //    
 //    std::vector<int> latt_size   = grid->_fdimensions;
 //    
 //    LatComplex denom(grid); denom= zero;
 //    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
 //    LatComplex   kmu(grid);
 //    
 //    ScalComplex ci(0.0,1.0);
 //    // momphase = n * 2pi / L
 //    for(int mu=0;mu<Nd;mu++) {
 //      
 //      LatticeCoordinate(kmu,mu);
 //      
 //      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
 //      
 //      kmu = TwoPiL * kmu ;
 //      
 //      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
 //    }
 //    std::vector<int> zero_mode(nd,0);
 //    TComplexD Tone = ComplexD(1.0,0.0);
 //    TComplexD Tzero= ComplexD(0.0,0.0);
 //    
 //    pokeSite(Tone,denom,zero_mode);
 //    
 //    denom= one/denom;
 //    
 //    pokeSite(Tzero,denom,zero_mode);
 //    
 //    out = zero;
 //    out = in*denom;
 //  };
 }}
 #endif
@@ -71,14 +71,18 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
    RealD factor = 0.5 * beta / RealD(Nc);
-    GaugeLinkField Umu(U._grid);
+    //GaugeLinkField Umu(U._grid);
    GaugeLinkField dSdU_mu(U._grid);
    for (int mu = 0; mu < Nd; mu++) {
-      Umu = PeekIndex<LorentzIndex>(U, mu);
+      //Umu = PeekIndex<LorentzIndex>(U, mu);
      // Staple in direction mu
-      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
+      //WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
-      dSdU_mu = Ta(Umu * dSdU_mu) * factor;
+      //dSdU_mu = Ta(Umu * dSdU_mu) * factor;
      WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
      dSdU_mu = Ta(dSdU_mu) * factor;
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
@@ -31,6 +31,7 @@ directory
 #include <Grid/qcd/action/scalar/ScalarImpl.h>
 #include <Grid/qcd/action/scalar/ScalarAction.h>
 #include <Grid/qcd/action/scalar/ScalarInteractionAction.h>
 namespace Grid {
 namespace QCD {
@@ -39,6 +40,10 @@ namespace QCD {
  typedef ScalarAction<ScalarImplF>                 ScalarActionF;
  typedef ScalarAction<ScalarImplD>                 ScalarActionD;
  template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
  template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
  template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;
 }
 }
@@ -6,10 +6,10 @@
  Copyright (C) 2015
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+  Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+  Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
+  Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+  Author: paboyle <paboyle@ph.ed.ac.uk>
  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
@@ -36,49 +36,48 @@ directory
 namespace Grid {
  // FIXME drop the QCD namespace everywhere here
-  template <class Impl>
+template <class Impl>
-  class ScalarAction : public QCD::Action<typename Impl::Field> {
+class ScalarAction : public QCD::Action<typename Impl::Field> {
-  public:
+ public:
    INHERIT_FIELD_TYPES(Impl);
-  private:
+ private:
    RealD mass_square;
    RealD lambda;
-  public:
+ public:
-    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
+    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
-    virtual std::string LogParameters(){
+    virtual std::string LogParameters() {
      std::stringstream sstream;
      sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
      return sstream.str();
    }
    virtual std::string action_name() {return "ScalarAction";}
-    virtual std::string action_name(){return "ScalarAction";}
+    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}  // noop as no pseudoferms
    virtual void refresh(const Field &U,
 			 GridParallelRNG &pRNG){};  // noop as no pseudoferms
    virtual RealD S(const Field &p) {
      return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
-	(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
+    (lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
-	ScalarObs<Impl>::sumphider(p);
+    ScalarObs<Impl>::sumphider(p);
    };
    virtual void deriv(const Field &p,
-		       Field &force) {
+                       Field &force) {
      Field tmp(p._grid);
      Field p2(p._grid);
      ScalarObs<Impl>::phisquared(p2, p);
      tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
      for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
+      force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
-    };
+    }
-  };
+};
-} // Grid
+
 }  // namespace Grid
 #endif // SCALAR_ACTION_H
@@ -5,18 +5,22 @@
 namespace Grid {
  //namespace QCD {
-  template <class S>
+template <class S>
-  class ScalarImplTypes {
+class ScalarImplTypes {
-  public:
+ public:
    typedef S Simd;
    template <typename vtype>
    using iImplField = iScalar<iScalar<iScalar<vtype> > >;
    typedef iImplField<Simd> SiteField;
-    
+    typedef SiteField        SitePropagator;
    typedef SiteField        SiteComplex;
    typedef Lattice<SiteField> Field;
    typedef Field              ComplexField;
    typedef Field              FermionField;
    typedef Field              PropagatorField;
    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
      gaussian(pRNG, P);
@@ -24,11 +28,11 @@ namespace Grid {
    static inline Field projectForce(Field& P){return P;}
-    static inline void update_field(Field& P, Field& U, double ep){
+    static inline void update_field(Field& P, Field& U, double ep) {
      U += P*ep;
    }
-    static inline RealD FieldSquareNorm(Field& U){
+    static inline RealD FieldSquareNorm(Field& U) {
      return (- sum(trace(U*U))/2.0);
    }
@@ -44,45 +48,91 @@ namespace Grid {
      U = 1.0;
    }
    // Fills `out` with the momentum-space kernel
    //   1 / ( m^2 + sum_mu khat_mu^2 ),  khat_mu = 2 sin( pi n_mu / L_mu ),
    // where n_mu is the lattice coordinate along mu and L_mu the full lattice
    // extent in that direction. The grid is taken from `out`.
    static void MomentumSpacePropagator(Field &out, RealD m)
    {
      GridBase           *lattice = out._grid;
      Field              khat(lattice), unity(lattice);
      const unsigned int ndim     = lattice->_ndimension;
      std::vector<int>   &extent  = lattice->_fdimensions;

      unity = Complex(1.0,0.0);

      // Start from the mass term, then accumulate khat_mu^2 per direction.
      out = m*m;
      for(int dir = 0; dir < ndim; dir++)
      {
        Real twoPiOverL = M_PI*2./extent[dir];

        LatticeCoordinate(khat,dir);
        khat = 2.*sin(.5*twoPiOverL*khat);
        out  = out + khat*khat;
      }

      // Invert site by site.
      out = unity/out;
    }
    static void FreePropagator(const Field &in, Field &out,
                               const Field &momKernel)
    {
      FFT   fft((GridCartesian *)in._grid);
      Field inFT(in._grid);
      fft.FFT_all_dim(inFT, in, FFT::forward);
      inFT = inFT*momKernel;
      fft.FFT_all_dim(out, inFT, FFT::backward);
    }
    static void FreePropagator(const Field &in, Field &out, RealD m)
    {
      Field momKernel(in._grid);
      MomentumSpacePropagator(momKernel, m);
      FreePropagator(in, out, momKernel);
    }
  };
  template <class S, unsigned int N>
-  class ScalarMatrixImplTypes {
+  class ScalarAdjMatrixImplTypes {
  public:
    typedef S Simd;
    typedef QCD::SU<N> Group;
    template <typename vtype>
-    using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
+    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
    template <typename vtype>
    using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;
-    typedef iImplField<Simd> SiteField;
+    typedef iImplField<Simd>   SiteField;
    typedef SiteField          SitePropagator;
    typedef iImplComplex<Simd> SiteComplex;
    typedef Lattice<SiteField>   Field;
    typedef Lattice<SiteComplex> ComplexField;
    typedef Field                FermionField;
    typedef Field                PropagatorField;
-    typedef Lattice<SiteField> Field;
+    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
-    
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
      gaussian(pRNG, P);
    }
-    static inline Field projectForce(Field& P){return P;}
+    static inline Field projectForce(Field& P) {return P;}
-    static inline void update_field(Field& P, Field& U, double ep){
+    static inline void update_field(Field& P, Field& U, double ep) {
      U += P*ep;
    }
-    static inline RealD FieldSquareNorm(Field& U){
+    static inline RealD FieldSquareNorm(Field& U) {
-      return (TensorRemove(- sum(trace(U*U))*0.5).real());
+      return (TensorRemove(sum(trace(U*U))).real());
    }
    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
    }
    static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
    }
    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
-      U = 1.0;
+      U = zero;
    }
  };
@@ -93,6 +143,18 @@ namespace Grid {
  typedef ScalarImplTypes<vReal> ScalarImplR;
  typedef ScalarImplTypes<vRealF> ScalarImplF;
  typedef ScalarImplTypes<vRealD> ScalarImplD;
  typedef ScalarImplTypes<vComplex> ScalarImplCR;
  typedef ScalarImplTypes<vComplexF> ScalarImplCF;
  typedef ScalarImplTypes<vComplexD> ScalarImplCD;
  // Hardcoding here the size of the matrices
  typedef ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc> ScalarAdjImplR;
  typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
  typedef ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc> ScalarAdjImplD;
  template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex,   Colours >;
  template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF,  Colours >;
  template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD,  Colours >;
  //}
 }
@@ -6,10 +6,7 @@
  Copyright (C) 2015
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+  Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: paboyle <paboyle@ph.ed.ac.uk>
  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
@@ -30,55 +27,122 @@ directory
  *************************************************************************************/
 /*  END LEGAL */
-#ifndef SCALAR_ACTION_H
+#ifndef SCALAR_INT_ACTION_H
-#define SCALAR_ACTION_H
+#define SCALAR_INT_ACTION_H
 // Note: this action can completely absorb the ScalarAction for real float fields
 // use the scalarObjs to generalise the structure
 namespace Grid {
  // FIXME drop the QCD namespace everywhere here
-  template <class Impl>
+  template <class Impl, int Ndim >
  class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
  public:
    INHERIT_FIELD_TYPES(Impl);
  private:
    RealD mass_square;
    RealD lambda;
  public:
    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
-    virtual std::string LogParameters(){
+    typedef typename Field::vector_object vobj;
    typedef CartesianStencil<vobj,vobj> Stencil;
    SimpleCompressor<vobj> compressor;
    int npoint = 2*Ndim;
    std::vector<int> directions;//    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
    std::vector<int> displacements;//  = {1,1,1,1, -1,-1,-1,-1};
  public:
    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
      for (int mu = 0 ; mu < Ndim; mu++){
 		directions[mu]         = mu; directions[mu+Ndim]    = mu;
 		displacements[mu]      =  1; displacements[mu+Ndim] = -1;
      }
    }
    virtual std::string LogParameters() {
      std::stringstream sstream;
      sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
      return sstream.str();
    }
-    virtual std::string action_name(){return "ScalarAction";}
+    virtual std::string action_name() {return "ScalarAction";}
-    virtual void refresh(const Field &U,
+    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
 			 GridParallelRNG &pRNG){};  // noop as no pseudoferms
    virtual RealD S(const Field &p) {
-      return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
+      assert(p._grid->Nd() == Ndim);
-	(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
+      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-	ScalarObs<Impl>::sumphider(p);
+      phiStencil.HaloExchange(p, compressor);
      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
      phisquared = p*p;
      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
      for (int mu = 0; mu < Ndim; mu++) {
 	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
 	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
 	  int permute_type;
 	  StencilEntry *SE;
 	  vobj temp2;
 	  const vobj *temp, *t_p;
 	  SE = phiStencil.GetEntry(permute_type, mu, i);
 	  t_p  = &p._odata[i];
 	  if ( SE->_is_local ) {
 	    temp = &p._odata[SE->_offset];
 	    if ( SE->_permute ) {
 	      permute(temp2, *temp, permute_type);
 	      action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
 	    } else {
 	      action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
 	    }
 	  } else {
 	    action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
 	  }
 	}
 	//  action -= pshift*p + p*pshift;
      }
      // NB the trace in the algebra is normalised to 1/2
      // minus sign coming from the antihermitian fields
      return -(TensorRemove(sum(trace(action)))).real();
    };
-    virtual void deriv(const Field &p,
+    virtual void deriv(const Field &p, Field &force) {
-		       Field &force) {
+      assert(p._grid->Nd() == Ndim);
-      Field tmp(p._grid);
+      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
-      Field p2(p._grid);
+      // move this outside
-      ScalarObs<Impl>::phisquared(p2, p);
+      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-      tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
+      phiStencil.HaloExchange(p, compressor);
      for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
+      //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-    };
+      for (int point = 0; point < npoint; point++) {
 	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
 	  const vobj *temp;
 	  vobj temp2;
 	  int permute_type;
 	  StencilEntry *SE;
 	  SE = phiStencil.GetEntry(permute_type, point, i);
 	  if ( SE->_is_local ) {
 	    temp = &p._odata[SE->_offset];
 	    if ( SE->_permute ) {
 	      permute(temp2, *temp, permute_type);
 	      force._odata[i] -= temp2;
 	    } else {
 	      force._odata[i] -= *temp;
 	    }
 	  } else {
 	    force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
 	  }
 	}
      }
    }
  };
-} // Grid
+}  // namespace Grid
-#endif // SCALAR_ACTION_H
+#endif  // SCALAR_INT_ACTION_H
@@ -207,6 +207,12 @@ using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator,
 typedef HMCWrapperTemplate<ScalarImplR, MinimumNorm2, ScalarFields>
    ScalarGenericHMCRunner;
 typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
    ScalarAdjGenericHMCRunner;
 template <int Colours> 
 using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
 }  // namespace QCD
 }  // namespace Grid
@@ -76,7 +76,7 @@ struct HMCparameters: Serializable {
  template < class ReaderClass > 
  void initialize(Reader<ReaderClass> &TheReader){
-  	std::cout << "Reading HMC\n";
+  	std::cout << GridLogMessage << "Reading HMC\n";
  	read(TheReader, "HMC", *this);
  }
@@ -165,7 +165,7 @@ class HMCResourceManager {
  // Grids
  //////////////////////////////////////////////////////////////
-  void AddGrid(std::string s, GridModule& M) {
+  void AddGrid(const std::string s, GridModule& M) {
    // Check for name clashes
    auto search = Grids.find(s);
    if (search != Grids.end()) {
@@ -174,14 +174,24 @@ class HMCResourceManager {
      exit(1);
    }
    Grids[s] = std::move(M);
    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
    std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
    std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
    Grids[s].show_full_decomposition();
    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
  }
  // Add a named grid set, 4d shortcut
-  void AddFourDimGrid(std::string s) {
+  void AddFourDimGrid(const std::string s) {
    GridFourDimModule<vComplex> Mod;
    AddGrid(s, Mod);
  }
  // Add a named grid set, 4d shortcut + tweak simd lanes
  void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
    GridFourDimModule<vComplex> Mod(simd_decomposition);
    AddGrid(s, Mod);
  }
  GridCartesian* GetCartesian(std::string s = "") {
@@ -253,6 +263,7 @@ class HMCResourceManager {
  template<class T, class... Types>
  void AddObservable(Types&&... Args){
    ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
    ObservablesList.back()->print_parameters();
  }
  std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
@@ -43,11 +43,12 @@ public:
  std::string, lattice,
  std::string, mpi);
-  std::vector<int> getLattice(){return strToVec<int>(lattice);}
+  std::vector<int> getLattice() const {return strToVec<int>(lattice);}
-  std::vector<int> getMpi()    {return strToVec<int>(mpi);}
+  std::vector<int> getMpi()     const {return strToVec<int>(mpi);}
-  void check(){
+
-    if (getLattice().size() != getMpi().size()) {
+  void check() const {
    if (getLattice().size() != getMpi().size() ) {
      std::cout << GridLogError
                << "Error in GridModuleParameters: lattice and mpi dimensions "
                   "do not match"
@@ -84,6 +85,8 @@ class GridModule {
  void set_full(GridCartesian* grid) { grid_.reset(grid); }
  void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
  void show_full_decomposition(){ grid_->show_decomposition(); }
  void show_rb_decomposition(){ rbgrid_->show_decomposition(); }
 protected:
  std::unique_ptr<GridCartesian> grid_;
@@ -95,31 +98,72 @@ class GridModule {
 // Classes for the user
 ////////////////////////////////////
 // Note: the space time grid should be out of the QCD namespace
-template< class vector_type>
+template <class vector_type>
-class GridFourDimModule : public GridModule {
+class GridFourDimModule : public GridModule
- public:
+{
-  GridFourDimModule() {
+public:
  GridFourDimModule()
  {
    using namespace QCD;
    set_full(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
+        GridDefaultLatt(), 
        GridDefaultSimd(4, vector_type::Nsimd()),
        GridDefaultMpi()));
    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
  }
-  GridFourDimModule(GridModuleParameters Params) {
+  GridFourDimModule(const std::vector<int> tweak_simd)
  {
    // Builds the full and red-black four-dimensional grids from the default
    // lattice and MPI layouts, using the caller-supplied SIMD decomposition
    // instead of GridDefaultSimd().
    //
    // tweak_simd must have exactly 4 entries whose PRODUCT equals
    // vector_type::Nsimd(). Any violation is logged and terminates the
    // program, like the other constructors' error paths.
    using namespace QCD;
    if (tweak_simd.size() != 4)
    {
      std::cout << GridLogError
                << "Error in GridFourDimModule: SIMD size different from 4" 
                << std::endl;
      exit(1);
    }

    // The lanes of the four directions multiply up to the SIMD width.
    int simd_product = 1;
    for (const auto &lanes : tweak_simd)
      simd_product *= lanes;
    std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << "  Product: " << simd_product << std::endl;

    if (simd_product != vector_type::Nsimd())
    {
      std::cout << GridLogError 
                << "Error in GridFourDimModule: product of SIMD lanes must be " 
                << vector_type::Nsimd() 
                << std::endl;
      // Do not fall through: returning here would leave grid_ and rbgrid_
      // null, and they are dereferenced later (e.g. show_full_decomposition).
      exit(1);
    }

    set_full(SpaceTimeGrid::makeFourDimGrid(
        GridDefaultLatt(), 
        tweak_simd, 
        GridDefaultMpi()));
    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
  }
  GridFourDimModule(const GridModuleParameters Params)
  {
    using namespace QCD;
    Params.check();
    std::vector<int> lattice_v = Params.getLattice();
    std::vector<int> mpi_v = Params.getMpi();
-    if (lattice_v.size() == 4) {
+    if (lattice_v.size() == 4)
    {
      set_full(SpaceTimeGrid::makeFourDimGrid(
-          lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
+          lattice_v, 
          GridDefaultSimd(4, vector_type::Nsimd()),
          mpi_v));
      set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
-    } else {
+    }
    else
    {
      std::cout << GridLogError
-          << "Error in GridFourDimModule: lattice dimension different from 4"
+                << "Error in GridFourDimModule: lattice dimension different from 4"
-          << std::endl;
+                << std::endl;
      exit(1);
    }
  }
@@ -102,7 +102,7 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
    FieldMetaData header;
    IldgReader _IldgReader;
    _IldgReader.open(config);
-    _IldgReader.readConfiguration(config,U,header);  // format from the header
+    _IldgReader.readConfiguration(U,header);  // format from the header
    _IldgReader.close();
    std::cout << GridLogMessage << "Read ILDG Configuration from " << config
@@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
  typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
  using ObsBase::ObsBase; // for constructors
  // acquire resource
  virtual void initialize(){
    this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
@@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
  PlaquetteMod(): ObsBase(NoParameters()){}
 };
 template < class Impl >
-class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
+class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
-  typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
+  typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
  using ObsBase::ObsBase; // for constructors
  // acquire resource
  virtual void initialize(){
-    this->ObservablePtr.reset(new TopologicalCharge<Impl>());
+    this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
  }
  public:
-  TopologicalChargeMod(): ObsBase(NoParameters()){}
+  TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
  TopologicalChargeMod(): ObsBase(){}
 };
 }// QCD temporarily here
@@ -33,9 +33,45 @@ directory
 namespace Grid {
 namespace QCD {
 // Serializable parameter set for the smearing used before measuring the
 // topological charge: number of steps, step size, measurement interval and
 // maximum flow time. All values default to zero.
 struct TopologySmearingParameters : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
    int, steps,
    float, step_size,
    int, meas_interval,
    float, maxTau);

    // Direct construction from explicit values.
    TopologySmearingParameters(int nSteps = 0, float stepSize = 0.0f, int measInterval = 0, float maxFlowTime = 0.0f):
        steps(nSteps),
        step_size(stepSize),
        meas_interval(measInterval),
        maxTau(maxFlowTime){}

    // Construction from a reader: the values are read from the "Smearing" node.
    template < class ReaderClass >
    TopologySmearingParameters(Reader<ReaderClass>& TheReader){
        read(TheReader, "Smearing", *this);
    }
 };
 struct TopologyObsParameters : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
      int, interval,
      bool, do_smearing,
      TopologySmearingParameters, Smearing);  
    TopologyObsParameters(int interval = 1, bool smearing = false):
        interval(interval), Smearing(smearing){}
    template <class ReaderClass >
      TopologyObsParameters(Reader<ReaderClass>& Reader){
        read(Reader, "TopologyMeasurement", *this);
  }
 };
 // this is only defined for a gauge theory
 template <class Impl>
 class TopologicalCharge : public HmcObservable<typename Impl::Field> {
    TopologyObsParameters Pars;
 public:
    // here forces the Impl to be of gauge fields
    // if not the compiler will complain
@@ -44,20 +80,39 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
    // necessary for HmcObservable compatibility
    typedef typename Impl::Field Field;
    TopologicalCharge(int interval = 1, bool do_smearing = false):
        Pars(interval, do_smearing){}
    TopologicalCharge(TopologyObsParameters P):Pars(P){
        std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
    }
    void TrajectoryComplete(int traj,
                            Field &U,
                            GridSerialRNG &sRNG,
                            GridParallelRNG &pRNG) {
-    Real q = WilsonLoops<Impl>::TopologicalCharge(U);
+    if (traj%Pars.interval == 0){
        // Smearing
        Field Usmear = U;
        int def_prec = std::cout.precision();
-    int def_prec = std::cout.precision();
+        if (Pars.do_smearing){
            // using wilson flow by default here
            WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
            WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
            Real T0   = WF.energyDensityPlaquette(Usmear);
            std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
                      << "T0                : [ " << traj << " ] "<< T0 << std::endl;
        }
-    std::cout << GridLogMessage
+        Real q    = WilsonLoops<Impl>::TopologicalCharge(Usmear);
-        << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+        std::cout << GridLogMessage
-        << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
+            << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
            << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
-    std::cout.precision(def_prec);
+        std::cout.precision(def_prec);
        }
    }
 };
@@ -62,7 +62,10 @@ class Representations {
 typedef Representations<FundamentalRepresentation> NoHirep;
 typedef Representations<EmptyRep<typename ScalarImplR::Field> > ScalarFields;
-  //typedef Representations<EmptyRep<typename ScalarMatrixImplR::Field> > ScalarMatrixFields;
+typedef Representations<EmptyRep<typename ScalarAdjImplR::Field> > ScalarMatrixFields;
 template < int Colours> 
 using ScalarNxNMatrixFields = Representations<EmptyRep<typename ScalarNxNAdjImplR<Colours>::Field> >;
 // Helper classes to access the elements
 // Strips the first N parameters from the tuple
@@ -58,6 +58,8 @@ class Smear_Stout : public Smear<Gimpl> {
    SmearBase->smear(C, U);
  };
  // Repetition of code here (use the Tensor_exp.h function)
  void exponentiate_iQ(GaugeLinkField& e_iQ, const GaugeLinkField& iQ) const {
    // Put this outside
    // only valid for SU(3) matrices
@@ -36,20 +36,23 @@ namespace QCD {
 template <class Gimpl>
 class WilsonFlow: public Smear<Gimpl>{
    unsigned int Nstep;
-    RealD epsilon;
+    unsigned int measure_interval;
    mutable RealD epsilon, taus;
    mutable WilsonGaugeAction<Gimpl> SG;
    void evolve_step(typename Gimpl::GaugeField&) const;
    void evolve_step_adaptive(typename Gimpl::GaugeField&, RealD);
    RealD tau(unsigned int t)const {return epsilon*(t+1.0); }
 public:
    INHERIT_GIMPL_TYPES(Gimpl)
-    explicit WilsonFlow(unsigned int Nstep, RealD epsilon):
+    explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
        Nstep(Nstep),
        epsilon(epsilon),
        measure_interval(interval),
        SG(WilsonGaugeAction<Gimpl>(3.0)) {
            // WilsonGaugeAction with beta 3.0
            assert(epsilon > 0.0);
@@ -72,7 +75,9 @@ class WilsonFlow: public Smear<Gimpl>{
        // undefined for WilsonFlow
    }
    void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau);
    RealD energyDensityPlaquette(unsigned int step, const GaugeField& U) const;
    RealD energyDensityPlaquette(const GaugeField& U) const;
 };
@@ -98,23 +103,110 @@ void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U) const{
    Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2
 }
 // Performs one adaptive integration step of the flow on U, in place.
 //
 // The step size `epsilon` is first clamped so that the accumulated flow time
 // `taus` does not exceed maxTau. U is then advanced through three successive
 // update_field calls (W0 -> W1 -> W2 -> V), while a second estimate Uprime is
 // built from the starting field with the combination Zprime = -Z0 + 2*Z1.
 // norm2(U - Uprime) is used as the error estimate to rescale `epsilon` for
 // the next call. Both `taus` and `epsilon` are member state modified here.
 //
 // NOTE(review): the coefficients look like a three-stage Runge-Kutta scheme
 // with an embedded lower-order estimate (the existing comment cites "Ramos")
 // - confirm against the reference before changing any constant.
 template <class Gimpl>
 void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD maxTau) {
    // Shorten the last step so the flow stops exactly at maxTau.
    if (maxTau - taus < epsilon){
        epsilon = maxTau-taus;
    }
    //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
    GaugeField Z(U._grid);
    GaugeField Zprime(U._grid);
    GaugeField tmp(U._grid), Uprime(U._grid);
    // Keep a copy of the starting field for the second (comparison) estimate.
    Uprime = U;
    SG.deriv(U, Z);
    Zprime = -Z;
    Z *= 0.25;                                  // Z0 = 1/4 * F(U)
    Gimpl::update_field(Z, U, -2.0*epsilon);    // U = W1 = exp(ep*Z0)*W0

    Z *= -17.0/8.0;
    SG.deriv(U, tmp); Z += tmp;                 // -17/32*Z0 +Z1
    Zprime += 2.0*tmp;                          // Zprime = -Z0 + 2*Z1 (tmp holds the derivative at W1)
    Z *= 8.0/9.0;                               // Z = -17/36*Z0 +8/9*Z1
    Gimpl::update_field(Z, U, -2.0*epsilon);    // U_= W2 = exp(ep*Z)*W1

    Z *= -4.0/3.0;
    SG.deriv(U, tmp); Z += tmp;                 // 4/3*(17/36*Z0 -8/9*Z1) +Z2
    Z *= 3.0/4.0;                               // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
    Gimpl::update_field(Z, U, -2.0*epsilon);    // V(t+e) = exp(ep*Z)*W2

    // Ramos 
    Gimpl::update_field(Zprime, Uprime, -2.0*epsilon); // V'(t+e) = exp(ep*Z')*W0
    // Compute distance as norm^2 of the difference
    GaugeField diffU = U - Uprime;
    RealD diff = norm2(diffU);
    // adjust integration step
    // The flow time advances by the step just taken, before epsilon is rescaled.
    taus += epsilon;
    //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
    // New step = 0.95 * old step * (1e-4/diff)^(1/3).
    // NOTE(review): diff == 0 (the two estimates agree exactly) divides by
    // zero here and makes epsilon infinite - confirm whether that can occur.
    epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
    //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
 }
 // Returns 2 * t^2 * S(U) / (global number of sites), with the flow time
 // t = tau(step) derived from the fixed step size.
 template <class Gimpl>
 RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeField& U) const {
    const RealD flowTime = tau(step);
    const RealD weighted = 2.0 * flowTime * flowTime * SG.S(U);
    return weighted/U._grid->gSites();
 }
 // Returns 2 * taus^2 * S(U) / (global number of sites), using the flow time
 // `taus` accumulated by the adaptive integrator.
 template <class Gimpl>
 RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const GaugeField& U) const {
    const RealD weighted = 2.0 * taus * taus * SG.S(U);
    return weighted/U._grid->gSites();
 }
 //#define WF_TIMING 
 template <class Gimpl>
 void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
    out = in;
-    for (unsigned int step = 0; step < Nstep; step++) {
+    for (unsigned int step = 1; step <= Nstep; step++) {
        auto start = std::chrono::high_resolution_clock::now();
        evolve_step(out);
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> diff = end - start;
        #ifdef WF_TIMING
        std::cout << "Time to evolve " << diff.count() << " s\n";
        #endif
        std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-            << step << " "
+            << step << "  "
            << energyDensityPlaquette(step,out) << std::endl;
         if( step % measure_interval == 0){
         std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
            << step << "  " 
            << WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
        }
    }
 }
 // Flows `in` into `out` with the adaptive integrator until the accumulated
 // flow time `taus` reaches maxTau. After every step the plaquette energy
 // density is logged; every `measure_interval` steps the topological charge
 // is logged as well. A measure_interval of 0 disables the topological-charge
 // output (the unguarded `step % measure_interval` was undefined behaviour in
 // that case, and 0 is the default of TopologySmearingParameters).
 //
 // Modifies the member state `taus` and, through evolve_step_adaptive,
 // `epsilon`.
 template <class Gimpl>
 void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau){
    out = in;
    // NOTE(review): the flow time starts at epsilon, not 0, although no step
    // has been taken yet; this looks like a one-step offset - confirm intent.
    taus = epsilon;
    unsigned int step = 0;
    do{
        step++;
        //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
        evolve_step_adaptive(out, maxTau);
        std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
            << step << "  "
            << energyDensityPlaquette(out) << std::endl;
        // Guard against a zero interval before taking the modulus.
        if( measure_interval != 0 && step % measure_interval == 0){
            std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
                << step << "  " 
                << WilsonLoops<PeriodicGimplR>::TopologicalCharge(out) << std::endl;
        }
    } while (taus < maxTau);
 }
 }  // namespace QCD
 }  // namespace Grid
@@ -0,0 +1,193 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 //#include <Grid/Grid.h>
 #ifndef GRID_QCD_GAUGE_FIX_H
 #define GRID_QCD_GAUGE_FIX_H
 namespace Grid {
 namespace QCD {
 // Iterative gauge fixing by steepest descent, with an optional
 // Fourier-accelerated variant. All members are static; the class only
 // inherits the field types from Gimpl.
 // NOTE(review): DmuAmu sums over all Nd directions, which presumably makes
 // this a Landau-type gauge condition - confirm.
 template <class Gimpl> 
 class FourierAcceleratedGaugeFixer  : public Gimpl {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;
  // For each direction mu sets A[mu] = Ta(U[mu]) * (-i).
  // A must already hold Nd entries.
  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
    for(int mu=0;mu<Nd;mu++){
      Complex cmi(0.0,-1.0);
      A[mu] = Ta(U[mu]) * cmi;
    }
  }
  // Computes dmuAmu = sum_mu ( A[mu] - Cshift(A[mu],mu,-1) ), i.e. the sum
  // over directions of the difference between A[mu] and its copy shifted by
  // -1 along mu.
  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu) {
    dmuAmu=zero;
    for(int mu=0;mu<Nd;mu++){
      dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
    }
  }  
  // Driver: runs up to maxiter steepest-descent steps on Umu (modified in
  // place), using the Fourier-accelerated step when Fourier is true.
  // Every 20 iterations it logs the plaquette and norm2(dmuAmu) and tests
  //   Omega = 1 - trG                      < Omega_tol
  //   |Phi| = |1 - old_trace/link_trace|   < Phi_tol
  // returning as soon as both hold. If maxiter is exhausted the function
  // returns without any message. alpha is forwarded unchanged to the step
  // routines.
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false) {
    GridBase *grid = Umu._grid;
    // org_plaq is computed but not used again in this function.
    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
    Real old_trace = org_link_trace;
    Real trG;
    std::vector<GaugeMat> U(Nd,grid);
                 GaugeMat dmuAmu(grid);
    for(int i=0;i<maxiter;i++){
      // Split the Lorentz-indexed field into per-direction links, update
      // them, and write them back.
      for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu);
      if ( Fourier==false ) { 
 	trG = SteepestDescentStep(U,alpha,dmuAmu);
      } else { 
 	trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu);
      }
      for(int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(Umu,U[mu],mu);
      // Monitor progress and convergence test 
      // infrequently to minimise cost overhead
      if ( i %20 == 0 ) { 
 	Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
 	Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
 	if (Fourier) 
 	  std::cout << GridLogMessage << "Fourier Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
 	else 
 	  std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
 	// Phi: relative change of the link trace since the previous check.
 	Real Phi  = 1.0 - old_trace / link_trace ;
 	// Omega: deviation from 1 of the normalised trace of the last transformation.
 	Real Omega= 1.0 - trG;
 	std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
 	if ( (Omega < Omega_tol) && ( ::fabs(Phi) < Phi_tol) ) {
 	  std::cout << GridLogMessage << "Converged ! "<<std::endl;
 	  return;
 	}
 	old_trace = link_trace;
      }
    }
  };
  // One plain steepest-descent step: builds A from U, forms the
  // transformation g via ExpiAlphaDmuAmu (which also fills dmuAmu), applies
  // it to U with SU<Nc>::GaugeTransform and returns
  // Re sum(trace(g)) / volume / Nc.
  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
    GridBase *grid = U[0]._grid;
    std::vector<GaugeMat> A(Nd,grid);
    GaugeMat g(grid);
    GaugeLinkToLieAlgebraField(U,A);
    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
    Real vol = grid->gSites();
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
    SU<Nc>::GaugeTransform(U,g);
    return trG;
  }
  // One Fourier-accelerated step: dmuAmu is transformed to momentum space,
  // multiplied site-wise by Fp = 16 / psq with
  // psq = sum_mu 4 sin^2(pmu/2), pmu = 2*pi*n_mu/L_mu, transformed back, and
  // then used to build and apply the transformation g as in the plain step.
  // Returns Re sum(trace(g)) / volume / Nc. On return dmuAmu holds the
  // filtered (back-transformed) field.
  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
    GridBase *grid = U[0]._grid;
    Real vol = grid->gSites();
    FFT theFFT((GridCartesian *)grid);
    LatticeComplex  Fp(grid);
    LatticeComplex  psq(grid); psq=zero;
    LatticeComplex  pmu(grid); 
    LatticeComplex   one(grid); one = Complex(1.0,0.0);
    GaugeMat g(grid);
    GaugeMat dmuAmu_p(grid);
    std::vector<GaugeMat> A(Nd,grid);
    GaugeLinkToLieAlgebraField(U,A);
    DmuAmu(A,dmuAmu);
    theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward);
    //////////////////////////////////
    // Work out Fp = psq_max/ psq...
    //////////////////////////////////
    std::vector<int> latt_size = grid->GlobalDimensions();
    // All-zero coordinate: the site whose Fp entry is overwritten below.
    std::vector<int> coor(grid->_ndimension,0);
    for(int mu=0;mu<Nd;mu++) {
      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
      LatticeCoordinate(pmu,mu);
      pmu = TwoPiL * pmu ;
      psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
    }
    Complex psqMax(16.0);
    Fp =  psqMax*one/psq;
    /*
    static int once;
    if ( once == 0 ) { 
      std::cout << " Fp " << Fp <<std::endl;
      once ++;
      }*/
    // psq vanishes at the all-zero coordinate, so the division above is
    // ill-defined there; that entry of Fp is replaced by 1.
    pokeSite(TComplex(1.0),Fp,coor);
    dmuAmu_p  = dmuAmu_p * Fp; 
    theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
    GaugeMat ciadmam(grid);
    Complex cialpha(0.0,-alpha);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);
    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
    SU<Nc>::GaugeTransform(U,g);
    return trG;
  }
  // Computes dmuAmu from A with DmuAmu, multiplies it by (-i*alpha) and
  // passes the product to SU<Nc>::taExp to obtain g.
  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
    GridBase *grid = g._grid;
    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
    DmuAmu(A,dmuAmu);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);
  }  
 };
 }
 }
 #endif
@@ -716,8 +716,7 @@ template<typename GaugeField,typename GaugeMat>
    for (int a = 0; a < AdjointDimension; a++) {
      generator(a, Ta);
-      auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep
+      pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a);
      pokeColour(h_out, tmp, a);
    }
  }
@@ -188,6 +188,32 @@ public:
    }
  }
 // For the force term
 // Accumulates into `staple`, for the fixed direction mu and every nu != mu,
 //   Cshift(U[nu],mu,1) * adj( U[nu] * Cshift(U[mu],nu,1) )
 //   + Cshift( adj( U[mu] * Cshift(U[nu],mu,1) ) * U[nu], nu, -1 )
 // and finally left-multiplies the sum by U[mu].
 // `staple` is overwritten; `Umu` is only read.
 static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    GridBase *grid = Umu._grid;

    // One link field per direction, peeled off the Lorentz-indexed field.
    std::vector<GaugeMat> U(Nd, grid);
    for (int dir = 0; dir < Nd; ++dir) {
      // this operation is taking too much time
      U[dir] = PeekIndex<LorentzIndex>(Umu, dir);
    }

    staple = zero;
    GaugeMat nuLinkShifted(grid);  // U[nu] shifted by +1 along mu
    GaugeMat scratch(grid);        // reused for both terms below

    for (int nu = 0; nu < Nd; ++nu) {
      if (nu == mu) continue;

      // this is ~10% faster than the Staple
      nuLinkShifted = Cshift(U[nu], mu, 1);
      scratch       = Cshift(U[mu], nu, 1);
      staple += nuLinkShifted * adj(U[nu] * scratch);

      scratch = adj(U[mu] * nuLinkShifted) * U[nu];
      staple += Cshift(scratch, nu, -1);
    }

    staple = U[mu]*staple;
 }
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
@@ -200,7 +226,6 @@ public:
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
    staple = zero;
    GaugeMat tmp(grid);
    for (int nu = 0; nu < Nd; nu++) {
@@ -227,6 +252,7 @@ public:
        // |__
        //
        //
        staple += Gimpl::ShiftStaple(
            Gimpl::CovShiftBackward(U[nu], nu,
                                    Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
@@ -289,8 +315,7 @@ public:
      //
      staple = Gimpl::ShiftStaple(
          Gimpl::CovShiftBackward(U[nu], nu,
-                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
          mu);
    }
  }
@@ -307,10 +332,10 @@ public:
      GaugeMat Vup(Umu._grid), Vdn(Umu._grid);
      StapleUpper(Vup, Umu, mu, nu);
      StapleLower(Vdn, Umu, mu, nu);
-      GaugeMat v = adj(Vup) - adj(Vdn);
+      GaugeMat v = Vup - Vdn;
      GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
      GaugeMat vu = v*u;
-      FS = 0.25*Ta(u*v + Cshift(vu, mu, +1));
+      FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
  }
  static Real TopologicalCharge(GaugeLorentz &U){
--- a/Show More
+++ b/Show More
`@@ -1,4 +1,4 @@`
	`]#!/usr/bin/env bash`	`#!/usr/bin/env bash`

	`EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2'`	`EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2'`