Minor changes

Fixing a compilation error
Added laplacian operator for smearing sources
2025-08-17 11:41:53 +01:00 · 2017-10-09 09:44:03 +01:00 · 2017-10-04 14:29:01 +01:00 · 2017-10-04 13:54:54 +01:00
734 changed files with 17439 additions and 68608 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -83,7 +83,6 @@ ltmain.sh
 .Trashes
 ehthumbs.db
 Thumbs.db
 .dirstamp
 # build directory #
 ###################
@@ -94,12 +93,14 @@ build*/*
 *.xcodeproj/*
 build.sh
 .vscode
 *.code-workspace
 # Eigen source #
 ################
-Grid/Eigen
+lib/Eigen/*
-Eigen/*
+
 # FFTW source #
 ################
 lib/fftw/*
 # libtool macros #
 ##################
@@ -110,8 +111,15 @@ m4/libtool.m4
 ################
 gh-pages/
 # Buck files #
 ##############
 .buck*
 buck-out
 BUCK
 make-bin-BUCK.sh
 # generated sources #
 #####################
-Grid/qcd/spin/gamma-gen/*.h
+lib/qcd/spin/gamma-gen/*.h
-Grid/qcd/spin/gamma-gen/*.cc
+lib/qcd/spin/gamma-gen/*.cc
-Grid/util/Version.h
+
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,11 +9,6 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
      env: PREC=single
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
      env: PREC=double
 before_install:
    - export GRIDDIR=`pwd`
@@ -21,11 +16,9 @@ before_install:
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
 install:
    - export CWD=`pwd`
    - echo $CWD
    - export CC=$CC$VERSION
    - export CXX=$CXX$VERSION
    - echo $PATH
@@ -38,24 +31,16 @@ install:
    - which $CXX
    - $CXX --version
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
 script:
    - ./bootstrap.sh
    - mkdir build
    - cd build
-    - mkdir lime
+    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
    - cd lime
    - mkdir build
    - cd build
    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
    - tar xf lime-1.3.2.tar.gz
    - cd lime-1.3.2
    - ./configure --prefix=$CWD/build/lime/install
    - make -j4
    - make install
    - cd $CWD/build
    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - echo make clean
    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
--- a/5
+++ b/5
@@ -1,5 +0,0 @@
 Version : 0.8.0
 - Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
 - MPI and MPI3 comms optimisations for KNL and OPA finished
 - Half precision comms
--- a/Grid/Makefile.am
+++ b/Grid/Makefile.am
@@ -1,63 +0,0 @@
 extra_sources=
 extra_headers=
 if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_mpi3.cc
  extra_sources+=communicator/Communicator_base.cc
  extra_sources+=communicator/SharedMemoryMPI.cc
  extra_sources+=communicator/SharedMemory.cc
 endif
 if BUILD_COMMS_NONE
  extra_sources+=communicator/Communicator_none.cc
  extra_sources+=communicator/Communicator_base.cc
  extra_sources+=communicator/SharedMemoryNone.cc
  extra_sources+=communicator/SharedMemory.cc
 endif
 if BUILD_HDF5
  extra_sources+=serialisation/Hdf5IO.cc 
  extra_headers+=serialisation/Hdf5IO.h
  extra_headers+=serialisation/Hdf5Type.h
 endif
 # Version stamping.
 # The version-cache recipe writes a "#define GITHASH ..." line (hash and ref
 # names of HEAD plus a "clean"/"uncommited changes" marker taken from
 # `git status --porcelain`) into the temporary file vertmp.  If version-cache
 # does not exist yet, or differs from vertmp, vertmp replaces it and Version.h
 # is removed; vertmp is always cleaned up at the end.  The Version.h rule then
 # copies version-cache to Version.h.  version-cache is .PHONY, so its recipe
 # runs on every invocation of `all`.
 all: version-cache
 version-cache:
 	@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
 		a="uncommited changes";\
 	else\
 		a="clean";\
 	fi;\
 	echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d $$a\\"%n" HEAD`" > vertmp;\
 	if [ -e version-cache ]; then\
 		d=`diff vertmp version-cache`;\
 		if [ "$${d}" != "" ]; then\
 			mv vertmp version-cache;\
 			rm -f Version.h;\
 		fi;\
 	else\
 		mv vertmp version-cache;\
 		rm -f Version.h;\
 	fi;\
 	rm -f vertmp
 Version.h:
 	cp version-cache Version.h
 .PHONY: version-cache
 #
 # Libraries
 #
 include Make.inc
 include Eigen.inc
 lib_LIBRARIES = libGrid.a
 CCFILES += $(extra_sources)
 HFILES  += $(extra_headers) Config.h Version.h
 libGrid_a_SOURCES              = $(CCFILES)
 libGrid_adir                   = $(includedir)/Grid
 nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) $(eigen_unsupp_files)
--- a/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
@@ -1,244 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 /**
  * Restarted GMRES solver for LinOp * psi = src ("communication avoiding" flavour).
  *
  * NOTE: as the warning printed by operator() states, this implementation
  * currently does not differ from regular GMRES.
  *
  * The solve is split into at most MaxNumberOfRestarts cycles of at most
  * RestartLength iterations each.  psi serves as the initial guess and is
  * updated in place at the end of every cycle.
  */
 template<class Field>
 class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
   bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
                           // defaults to true
   RealD   Tolerance;           // Converged when the squared residual is <= Tolerance^2 * norm2(src)
   Integer MaxIterations;       // Upper bound on the total number of iterations
   Integer RestartLength;       // Maximum number of iterations per restart cycle
   Integer MaxNumberOfRestarts; // MaxIterations / RestartLength, rounded up
   Integer IterationCount; // Number of iterations the CAGMRES took to finish,
                           // filled in upon completion
   GridStopWatch MatrixTimer;       // Time spent applying LinOp inside the cycles
   GridStopWatch LinalgTimer;       // Time spent in field linear algebra
   GridStopWatch QrTimer;           // Time spent in qrUpdate
   GridStopWatch CompSolutionTimer; // Time spent in computeSolution
   Eigen::MatrixXcd H;                      // Arnoldi coefficients, indexed H(iteration, basis vector)
   std::vector<std::complex<double>> y;     // Coefficients of the solution update (computeSolution)
   std::vector<std::complex<double>> gamma; // Rotated residual coefficients; norm(gamma[i+1]) is the residual estimate
   std::vector<std::complex<double>> c;     // Rotation coefficients stored by qrUpdate
   std::vector<std::complex<double>> s;     // Rotation coefficients stored by qrUpdate
   /**
    * @param tol             relative tolerance (see Tolerance)
    * @param maxit           maximum total number of iterations
    * @param restart_length  iterations per restart cycle, must be > 0
    * @param err_on_no_conv  assert when the solver does not converge
    *
    * The initialisers are listed in member declaration order, which is the
    * order in which they are executed.
    */
   CommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
                                                   Integer maxit,
                                                   Integer restart_length,
                                                   bool    err_on_no_conv = true)
       : ErrorOnNoConverge(err_on_no_conv)
       , Tolerance(tol)
       , MaxIterations(maxit)
       , RestartLength(checkedRestartLength(restart_length))
       , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
       , IterationCount(0)
       , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
       , y(RestartLength + 1, 0.)
       , gamma(RestartLength + 1, 0.)
       , c(RestartLength + 1, 0.)
       , s(RestartLength + 1, 0.) {}
   /**
    * Solve LinOp * psi = src, starting from the guess held in psi.
    *
    * Returns (after printing residuals and timings) as soon as a restart cycle
    * reports a squared residual <= Tolerance^2 * norm2(src).  If no cycle does,
    * a message is logged and, when ErrorOnNoConverge is set, assert(0) fires.
    */
   void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
     std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
     psi.checkerboard = src.checkerboard;
     conformable(psi, src);
     RealD guess = norm2(psi);
     assert(std::isnan(guess) == 0);
     RealD cp;
     RealD ssq = norm2(src);
     RealD rsq = Tolerance * Tolerance * ssq; // absolute target for the squared residual
     Field r(src._grid);
     std::cout << std::setprecision(4) << std::scientific;
     std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
     std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
     MatrixTimer.Reset();
     LinalgTimer.Reset();
     QrTimer.Reset();
     CompSolutionTimer.Reset();
     GridStopWatch SolverTimer;
     SolverTimer.Start();
     IterationCount = 0;
     for (int k=0; k<MaxNumberOfRestarts; k++) {
       cp = outerLoopBody(LinOp, src, psi, rsq);
       // Stopping condition
       if (cp <= rsq) {
         SolverTimer.Stop();
         // Recompute the residual from scratch to report the true residual
         LinOp.Op(psi,r);
         axpy(r,-1.0,src,r);
         RealD srcnorm       = sqrt(ssq);
         RealD resnorm       = sqrt(norm2(r));
         RealD true_residual = resnorm / srcnorm;
         std::cout << GridLogMessage        << "CommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
                   << " computed residual " << sqrt(cp / ssq)
                   << " true residual "     << true_residual
                   << " target "            << Tolerance << std::endl;
         std::cout << GridLogMessage << "CAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "CAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "CAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "CAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "CAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
         return;
       }
     }
     std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
     if (ErrorOnNoConverge)
       assert(0);
   }
   /**
    * Run one restart cycle of at most RestartLength iterations and update psi.
    *
    * @return the squared residual estimate norm(gamma[i+1]) of the last
    *         iteration performed, or 0 when the residual of the incoming psi
    *         is already exactly zero.
    */
   RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
     RealD cp = 0;
     Field w(src._grid);
     Field r(src._grid);
     // this should probably be made a class member so that it is only allocated once, not in every restart
     std::vector<Field> v(RestartLength + 1, src._grid);
     for (auto &elem : v) elem = zero;
     MatrixTimer.Start();
     LinOp.Op(psi, w);
     MatrixTimer.Stop();
     LinalgTimer.Start();
     r = src - w;
     const RealD rnorm = sqrt(norm2(r));
     if (rnorm == 0.) {
       // psi already solves the system exactly.  Normalising r below would
       // divide by zero and turn every later quantity into NaN, so the
       // stopping test could never succeed; report a zero residual instead.
       LinalgTimer.Stop();
       return 0.;
     }
     gamma[0] = rnorm;
     v[0] = (1. / gamma[0]) * r;
     LinalgTimer.Stop();
     for (int i=0; i<RestartLength; i++) {
       IterationCount++;
       arnoldiStep(LinOp, v, w, i);
       qrUpdate(i);
       cp = std::norm(gamma[i+1]);
       std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
                 << " residual " << cp << " target " << rsq << std::endl;
       // End the cycle on: last slot of the cycle, iteration budget exhausted, or convergence
       if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
         computeSolution(v, psi, i);
         return cp;
       }
     }
     assert(0); // Never reached
     return cp;
   }
   /**
    * Apply LinOp to v[iter], subtract its components along v[0..iter] (storing
    * the coefficients in H(iter, 0..iter)) and store the normalised remainder
    * in v[iter + 1]; the remainder's norm goes to H(iter, iter + 1).
    */
   void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
     MatrixTimer.Start();
     LinOp.Op(v[iter], w);
     MatrixTimer.Stop();
     LinalgTimer.Start();
     for (int i = 0; i <= iter; ++i) {
       H(iter, i) = innerProduct(v[i], w);
       w = w - H(iter, i) * v[i];
     }
     H(iter, iter + 1) = sqrt(norm2(w));
     v[iter + 1] = (1. / H(iter, iter + 1)) * w;
     LinalgTimer.Stop();
   }
   /**
    * Apply the rotations stored in c[0..iter-1], s[0..iter-1] to the entries
    * H(iter, *), then compute a new rotation (c[iter], s[iter]) that zeroes
    * H(iter, iter + 1) and apply it to gamma.
    */
   void qrUpdate(int iter) {
     QrTimer.Start();
     for (int i = 0; i < iter ; ++i) {
       auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
       H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
       H(iter, i + 1) = tmp;
     }
     // Compute new Givens Rotation
     ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
     c[iter]     = H(iter, iter) / nu;
     s[iter]     = H(iter, iter + 1) / nu;
     // Apply new Givens rotation
     H(iter, iter)     = nu;
     H(iter, iter + 1) = 0.;
     gamma[iter + 1] = -s[iter] * gamma[iter];
     gamma[iter]     = std::conj(c[iter]) * gamma[iter];
     QrTimer.Stop();
   }
   /**
    * Compute y[0..iter] by back substitution from H and gamma, then add
    * sum_i v[i] * y[i] to psi.
    */
   void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
     CompSolutionTimer.Start();
     for (int i = iter; i >= 0; i--) {
       y[i] = gamma[i];
       for (int k = i + 1; k <= iter; k++)
         y[i] = y[i] - H(k, i) * y[k];
       y[i] = y[i] / H(i, i);
     }
     for (int i = 0; i <= iter; i++)
       psi = psi + v[i] * y[i];
     CompSolutionTimer.Stop();
   }
 private:
   // Rejects a zero restart length before the constructor divides by it.
   static Integer checkedRestartLength(Integer restart_length) {
     assert(restart_length > 0);
     return restart_length;
   }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -1,108 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/Deflation.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_DEFLATION_H
 #define GRID_DEFLATION_H
 namespace Grid { 
 /// Initial-guess functor that ignores src and sets the guess to zero.
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
   virtual void operator()(const Field &src, Field &guess) {
     guess = zero;
   }
 };
 /// Initial-guess functor that leaves the caller's guess untouched.
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
   virtual void operator()(const Field &src, Field &guess) {
     // Intentionally empty: neither src nor guess is accessed.
   }
 };
 /// Initial-guess functor that copies the source into the guess.
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
   virtual void operator()(const Field &src, Field &guess) {
     guess = src;
   }
 };
 ////////////////////////////////
 // Fine grid deflation
 ////////////////////////////////
 /**
  * Initial-guess functor built from fine-grid eigenvectors.
  *
  * For every eigenpair the projection innerProduct(evec[i], src) / eval[i] is
  * accumulated along evec[i] into guess via axpy (assumed to compute
  * a*x + y -- confirm against its definition).
  *
  * Only references to the two containers are stored, so they must outlive
  * this object.
  */
 template<class Field>
 class DeflatedGuesser: public LinearFunction<Field> {
 private:
   const std::vector<Field> &evec;
   const std::vector<RealD> &eval;
 public:
   DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {}
   /// Overwrites guess with the deflated guess for src; with no eigenvectors guess is zero.
   virtual void operator()(const Field &src,Field &guess) {
     guess = zero;
     assert(evec.size()==eval.size());
     // Unsigned index: avoids comparing a signed int against size().
     const std::size_t N = evec.size();
     for (std::size_t i = 0; i < N; ++i) {
       const Field& tmp = evec[i];
       axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
     }
     guess.checkerboard = src.checkerboard;
   }
 };
 /**
  * Initial-guess functor that deflates on a coarse grid.
  *
  * The source is projected onto the coarse space spanned by `subspace`
  * (blockProject), deflated there with the coarse eigenpairs, and the coarse
  * guess is promoted back to the fine grid (blockPromote).
  *
  * Only references to the three containers are stored, so they must outlive
  * this object.
  */
 template<class FineField, class CoarseField>
 class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
 private:
   const std::vector<FineField>   &subspace;
   const std::vector<CoarseField> &evec_coarse;
   const std::vector<RealD>       &eval_coarse;
 public:
   LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
                                 const std::vector<CoarseField> &_evec_coarse,
                                 const std::vector<RealD>       &_eval_coarse)
     : subspace(_subspace),
       evec_coarse(_evec_coarse),
       eval_coarse(_eval_coarse)
   {
   }
   /// Overwrites guess with the coarse-grid deflated guess for src.
   void operator()(const FineField &src,FineField &guess) {
     assert(evec_coarse.size()==eval_coarse.size());
     if (evec_coarse.empty()) {
       // The coarse grid is taken from evec_coarse[0]; with no eigenvectors
       // that access would be out of bounds.  Nothing can be deflated, so the
       // guess is zero.
       guess = zero;
       guess.checkerboard = src.checkerboard;
       return;
     }
     const std::size_t N = evec_coarse.size();
     CoarseField src_coarse(evec_coarse[0]._grid);
     CoarseField guess_coarse(evec_coarse[0]._grid);
     guess_coarse = zero;
     blockProject(src_coarse,src,subspace);
     for (std::size_t i = 0; i < N; ++i) {
       const CoarseField & tmp = evec_coarse[i];
       axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
     }
     blockPromote(guess_coarse,guess,subspace);
     guess.checkerboard = src.checkerboard;
   }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
@@ -1,256 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 /**
  * Restarted flexible (preconditioned) GMRES solver for LinOp * psi = src
  * ("communication avoiding" flavour).
  *
  * NOTE: as the warning printed by operator() states, this implementation
  * currently does not differ from regular FGMRES.
  *
  * The solve is split into at most MaxNumberOfRestarts cycles of at most
  * RestartLength iterations each.  psi serves as the initial guess and is
  * updated in place at the end of every cycle.  Only a reference to the
  * preconditioner is stored, so it must outlive this object.
  */
 template<class Field>
 class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
   bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
                           // defaults to true
   RealD   Tolerance;           // Converged when the squared residual is <= Tolerance^2 * norm2(src)
   Integer MaxIterations;       // Upper bound on the total number of iterations
   Integer RestartLength;       // Maximum number of iterations per restart cycle
   Integer MaxNumberOfRestarts; // MaxIterations / RestartLength, rounded up
   Integer IterationCount; // Number of iterations the FCAGMRES took to finish,
                           // filled in upon completion
   GridStopWatch MatrixTimer;       // Time spent applying LinOp inside the cycles
   GridStopWatch PrecTimer;         // Time spent in the preconditioner
   GridStopWatch LinalgTimer;       // Time spent in field linear algebra
   GridStopWatch QrTimer;           // Time spent in qrUpdate
   GridStopWatch CompSolutionTimer; // Time spent in computeSolution
   Eigen::MatrixXcd H;                      // Arnoldi coefficients, indexed H(iteration, basis vector)
   std::vector<std::complex<double>> y;     // Coefficients of the solution update (computeSolution)
   std::vector<std::complex<double>> gamma; // Rotated residual coefficients; norm(gamma[i+1]) is the residual estimate
   std::vector<std::complex<double>> c;     // Rotation coefficients stored by qrUpdate
   std::vector<std::complex<double>> s;     // Rotation coefficients stored by qrUpdate
   LinearFunction<Field> &Preconditioner;   // Applied to v[iter] in every Arnoldi step
   /**
    * @param tol             relative tolerance (see Tolerance)
    * @param maxit           maximum total number of iterations
    * @param Prec            preconditioner, stored by reference
    * @param restart_length  iterations per restart cycle, must be > 0
    * @param err_on_no_conv  assert when the solver does not converge
    *
    * The initialisers are listed in member declaration order, which is the
    * order in which they are executed.
    */
   FlexibleCommunicationAvoidingGeneralisedMinimalResidual(RealD   tol,
                                                           Integer maxit,
                                                           LinearFunction<Field> &Prec,
                                                           Integer restart_length,
                                                           bool    err_on_no_conv = true)
       : ErrorOnNoConverge(err_on_no_conv)
       , Tolerance(tol)
       , MaxIterations(maxit)
       , RestartLength(checkedRestartLength(restart_length))
       , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
       , IterationCount(0)
       , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
       , y(RestartLength + 1, 0.)
       , gamma(RestartLength + 1, 0.)
       , c(RestartLength + 1, 0.)
       , s(RestartLength + 1, 0.)
       , Preconditioner(Prec) {}
   /**
    * Solve LinOp * psi = src, starting from the guess held in psi.
    *
    * Returns (after printing residuals and timings) as soon as a restart cycle
    * reports a squared residual <= Tolerance^2 * norm2(src).  If no cycle does,
    * a message is logged and, when ErrorOnNoConverge is set, assert(0) fires.
    */
   void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
     std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
     psi.checkerboard = src.checkerboard;
     conformable(psi, src);
     RealD guess = norm2(psi);
     assert(std::isnan(guess) == 0);
     RealD cp;
     RealD ssq = norm2(src);
     RealD rsq = Tolerance * Tolerance * ssq; // absolute target for the squared residual
     Field r(src._grid);
     std::cout << std::setprecision(4) << std::scientific;
     std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
     std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual:   src " << ssq   << std::endl;
     PrecTimer.Reset();
     MatrixTimer.Reset();
     LinalgTimer.Reset();
     QrTimer.Reset();
     CompSolutionTimer.Reset();
     GridStopWatch SolverTimer;
     SolverTimer.Start();
     IterationCount = 0;
     for (int k=0; k<MaxNumberOfRestarts; k++) {
       cp = outerLoopBody(LinOp, src, psi, rsq);
       // Stopping condition
       if (cp <= rsq) {
         SolverTimer.Stop();
         // Recompute the residual from scratch to report the true residual
         LinOp.Op(psi,r);
         axpy(r,-1.0,src,r);
         RealD srcnorm       = sqrt(ssq);
         RealD resnorm       = sqrt(norm2(r));
         RealD true_residual = resnorm / srcnorm;
         std::cout << GridLogMessage        << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
                   << " computed residual " << sqrt(cp / ssq)
                   << " true residual "     << true_residual
                   << " target "            << Tolerance << std::endl;
         std::cout << GridLogMessage << "FCAGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FCAGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FCAGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FCAGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FCAGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FCAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
         return;
       }
     }
     std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
     if (ErrorOnNoConverge)
       assert(0);
   }
   /**
    * Run one restart cycle of at most RestartLength iterations and update psi.
    *
    * @return the squared residual estimate norm(gamma[i+1]) of the last
    *         iteration performed, or 0 when the residual of the incoming psi
    *         is already exactly zero.
    */
   RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
     RealD cp = 0;
     Field w(src._grid);
     Field r(src._grid);
     // these should probably be made class members so that they are only allocated once, not in every restart
     std::vector<Field> v(RestartLength + 1, src._grid);
     for (auto &elem : v) elem = zero;
     std::vector<Field> z(RestartLength + 1, src._grid);
     for (auto &elem : z) elem = zero;
     MatrixTimer.Start();
     LinOp.Op(psi, w);
     MatrixTimer.Stop();
     LinalgTimer.Start();
     r = src - w;
     const RealD rnorm = sqrt(norm2(r));
     if (rnorm == 0.) {
       // psi already solves the system exactly.  Normalising r below would
       // divide by zero and turn every later quantity into NaN, so the
       // stopping test could never succeed; report a zero residual instead.
       LinalgTimer.Stop();
       return 0.;
     }
     gamma[0] = rnorm;
     v[0] = (1. / gamma[0]) * r;
     LinalgTimer.Stop();
     for (int i=0; i<RestartLength; i++) {
       IterationCount++;
       arnoldiStep(LinOp, v, z, w, i);
       qrUpdate(i);
       cp = std::norm(gamma[i+1]);
       std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
                 << " residual " << cp << " target " << rsq << std::endl;
       // End the cycle on: last slot of the cycle, iteration budget exhausted, or convergence
       if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
         computeSolution(z, psi, i);
         return cp;
       }
     }
     assert(0); // Never reached
     return cp;
   }
   /**
    * Precondition v[iter] into z[iter], apply LinOp to z[iter], subtract the
    * result's components along v[0..iter] (storing the coefficients in
    * H(iter, 0..iter)) and store the normalised remainder in v[iter + 1]; the
    * remainder's norm goes to H(iter, iter + 1).
    */
   void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
     PrecTimer.Start();
     Preconditioner(v[iter], z[iter]);
     PrecTimer.Stop();
     MatrixTimer.Start();
     LinOp.Op(z[iter], w);
     MatrixTimer.Stop();
     LinalgTimer.Start();
     for (int i = 0; i <= iter; ++i) {
       H(iter, i) = innerProduct(v[i], w);
       w = w - H(iter, i) * v[i];
     }
     H(iter, iter + 1) = sqrt(norm2(w));
     v[iter + 1] = (1. / H(iter, iter + 1)) * w;
     LinalgTimer.Stop();
   }
   /**
    * Apply the rotations stored in c[0..iter-1], s[0..iter-1] to the entries
    * H(iter, *), then compute a new rotation (c[iter], s[iter]) that zeroes
    * H(iter, iter + 1) and apply it to gamma.
    */
   void qrUpdate(int iter) {
     QrTimer.Start();
     for (int i = 0; i < iter ; ++i) {
       auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
       H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
       H(iter, i + 1) = tmp;
     }
     // Compute new Givens Rotation
     ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
     c[iter]     = H(iter, iter) / nu;
     s[iter]     = H(iter, iter + 1) / nu;
     // Apply new Givens rotation
     H(iter, iter)     = nu;
     H(iter, iter + 1) = 0.;
     gamma[iter + 1] = -s[iter] * gamma[iter];
     gamma[iter]     = std::conj(c[iter]) * gamma[iter];
     QrTimer.Stop();
   }
   /**
    * Compute y[0..iter] by back substitution from H and gamma, then add
    * sum_i z[i] * y[i] (the preconditioned vectors) to psi.
    */
   void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
     CompSolutionTimer.Start();
     for (int i = iter; i >= 0; i--) {
       y[i] = gamma[i];
       for (int k = i + 1; k <= iter; k++)
         y[i] = y[i] - H(k, i) * y[k];
       y[i] = y[i] / H(i, i);
     }
     for (int i = 0; i <= iter; i++)
       psi = psi + z[i] * y[i];
     CompSolutionTimer.Stop();
   }
 private:
   // Rejects a zero restart length before the constructor divides by it.
   static Integer checkedRestartLength(Integer restart_length) {
     assert(restart_length > 0);
     return restart_length;
   }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
@@ -1,254 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 /**
  * Restarted flexible (preconditioned) GMRES solver for LinOp * psi = src.
  *
  * The solve is split into at most MaxNumberOfRestarts cycles of at most
  * RestartLength iterations each.  psi serves as the initial guess and is
  * updated in place at the end of every cycle.  Only a reference to the
  * preconditioner is stored, so it must outlive this object.
  */
 template<class Field>
 class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
   bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
                           // defaults to true
   RealD   Tolerance;           // Converged when the squared residual is <= Tolerance^2 * norm2(src)
   Integer MaxIterations;       // Upper bound on the total number of iterations
   Integer RestartLength;       // Maximum number of iterations per restart cycle
   Integer MaxNumberOfRestarts; // MaxIterations / RestartLength, rounded up
   Integer IterationCount; // Number of iterations the FGMRES took to finish,
                           // filled in upon completion
   GridStopWatch MatrixTimer;       // Time spent applying LinOp inside the cycles
   GridStopWatch PrecTimer;         // Time spent in the preconditioner
   GridStopWatch LinalgTimer;       // Time spent in field linear algebra
   GridStopWatch QrTimer;           // Time spent in qrUpdate
   GridStopWatch CompSolutionTimer; // Time spent in computeSolution
   Eigen::MatrixXcd H;                      // Arnoldi coefficients, indexed H(iteration, basis vector)
   std::vector<std::complex<double>> y;     // Coefficients of the solution update (computeSolution)
   std::vector<std::complex<double>> gamma; // Rotated residual coefficients; norm(gamma[i+1]) is the residual estimate
   std::vector<std::complex<double>> c;     // Rotation coefficients stored by qrUpdate
   std::vector<std::complex<double>> s;     // Rotation coefficients stored by qrUpdate
   LinearFunction<Field> &Preconditioner;   // Applied to v[iter] in every Arnoldi step
   /**
    * @param tol             relative tolerance (see Tolerance)
    * @param maxit           maximum total number of iterations
    * @param Prec            preconditioner, stored by reference
    * @param restart_length  iterations per restart cycle, must be > 0
    * @param err_on_no_conv  assert when the solver does not converge
    *
    * The initialisers are listed in member declaration order, which is the
    * order in which they are executed.
    */
   FlexibleGeneralisedMinimalResidual(RealD   tol,
                                      Integer maxit,
                                      LinearFunction<Field> &Prec,
                                      Integer restart_length,
                                      bool    err_on_no_conv = true)
       : ErrorOnNoConverge(err_on_no_conv)
       , Tolerance(tol)
       , MaxIterations(maxit)
       , RestartLength(checkedRestartLength(restart_length))
       , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
       , IterationCount(0)
       , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
       , y(RestartLength + 1, 0.)
       , gamma(RestartLength + 1, 0.)
       , c(RestartLength + 1, 0.)
       , s(RestartLength + 1, 0.)
       , Preconditioner(Prec) {}
   /**
    * Solve LinOp * psi = src, starting from the guess held in psi.
    *
    * Returns (after printing residuals and timings) as soon as a restart cycle
    * reports a squared residual <= Tolerance^2 * norm2(src).  If no cycle does,
    * a message is logged and, when ErrorOnNoConverge is set, assert(0) fires.
    */
   void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
     psi.checkerboard = src.checkerboard;
     conformable(psi, src);
     RealD guess = norm2(psi);
     assert(std::isnan(guess) == 0);
     RealD cp;
     RealD ssq = norm2(src);
     RealD rsq = Tolerance * Tolerance * ssq; // absolute target for the squared residual
     Field r(src._grid);
     std::cout << std::setprecision(4) << std::scientific;
     std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
     std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual:   src " << ssq   << std::endl;
     PrecTimer.Reset();
     MatrixTimer.Reset();
     LinalgTimer.Reset();
     QrTimer.Reset();
     CompSolutionTimer.Reset();
     GridStopWatch SolverTimer;
     SolverTimer.Start();
     IterationCount = 0;
     for (int k=0; k<MaxNumberOfRestarts; k++) {
       cp = outerLoopBody(LinOp, src, psi, rsq);
       // Stopping condition
       if (cp <= rsq) {
         SolverTimer.Stop();
         // Recompute the residual from scratch to report the true residual
         LinOp.Op(psi,r);
         axpy(r,-1.0,src,r);
         RealD srcnorm       = sqrt(ssq);
         RealD resnorm       = sqrt(norm2(r));
         RealD true_residual = resnorm / srcnorm;
         std::cout << GridLogMessage        << "FlexibleGeneralisedMinimalResidual: Converged on iteration " << IterationCount
                   << " computed residual " << sqrt(cp / ssq)
                   << " true residual "     << true_residual
                   << " target "            << Tolerance << std::endl;
         std::cout << GridLogMessage << "FGMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FGMRES Time elapsed: Precon  " <<         PrecTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FGMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FGMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FGMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
         std::cout << GridLogMessage << "FGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
         return;
       }
     }
     std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
     if (ErrorOnNoConverge)
       assert(0);
   }
   /**
    * Run one restart cycle of at most RestartLength iterations and update psi.
    *
    * @return the squared residual estimate norm(gamma[i+1]) of the last
    *         iteration performed, or 0 when the residual of the incoming psi
    *         is already exactly zero.
    */
   RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
     RealD cp = 0;
     Field w(src._grid);
     Field r(src._grid);
     // these should probably be made class members so that they are only allocated once, not in every restart
     std::vector<Field> v(RestartLength + 1, src._grid);
     for (auto &elem : v) elem = zero;
     std::vector<Field> z(RestartLength + 1, src._grid);
     for (auto &elem : z) elem = zero;
     MatrixTimer.Start();
     LinOp.Op(psi, w);
     MatrixTimer.Stop();
     LinalgTimer.Start();
     r = src - w;
     const RealD rnorm = sqrt(norm2(r));
     if (rnorm == 0.) {
       // psi already solves the system exactly.  Normalising r below would
       // divide by zero and turn every later quantity into NaN, so the
       // stopping test could never succeed; report a zero residual instead.
       LinalgTimer.Stop();
       return 0.;
     }
     gamma[0] = rnorm;
     v[0] = (1. / gamma[0]) * r;
     LinalgTimer.Stop();
     for (int i=0; i<RestartLength; i++) {
       IterationCount++;
       arnoldiStep(LinOp, v, z, w, i);
       qrUpdate(i);
       cp = std::norm(gamma[i+1]);
       std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
                 << " residual " << cp << " target " << rsq << std::endl;
       // End the cycle on: last slot of the cycle, iteration budget exhausted, or convergence
       if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
         computeSolution(z, psi, i);
         return cp;
       }
     }
     assert(0); // Never reached
     return cp;
   }
   /**
    * Precondition v[iter] into z[iter], apply LinOp to z[iter], subtract the
    * result's components along v[0..iter] (storing the coefficients in
    * H(iter, 0..iter)) and store the normalised remainder in v[iter + 1]; the
    * remainder's norm goes to H(iter, iter + 1).
    */
   void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
     PrecTimer.Start();
     Preconditioner(v[iter], z[iter]);
     PrecTimer.Stop();
     MatrixTimer.Start();
     LinOp.Op(z[iter], w);
     MatrixTimer.Stop();
     LinalgTimer.Start();
     for (int i = 0; i <= iter; ++i) {
       H(iter, i) = innerProduct(v[i], w);
       w = w - H(iter, i) * v[i];
     }
     H(iter, iter + 1) = sqrt(norm2(w));
     v[iter + 1] = (1. / H(iter, iter + 1)) * w;
     LinalgTimer.Stop();
   }
   /**
    * Apply the rotations stored in c[0..iter-1], s[0..iter-1] to the entries
    * H(iter, *), then compute a new rotation (c[iter], s[iter]) that zeroes
    * H(iter, iter + 1) and apply it to gamma.
    */
   void qrUpdate(int iter) {
     QrTimer.Start();
     for (int i = 0; i < iter ; ++i) {
       auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
       H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
       H(iter, i + 1) = tmp;
     }
     // Compute new Givens Rotation
     ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
     c[iter]     = H(iter, iter) / nu;
     s[iter]     = H(iter, iter + 1) / nu;
     // Apply new Givens rotation
     H(iter, iter)     = nu;
     H(iter, iter + 1) = 0.;
     gamma[iter + 1] = -s[iter] * gamma[iter];
     gamma[iter]     = std::conj(c[iter]) * gamma[iter];
     QrTimer.Stop();
   }
   /**
    * Compute y[0..iter] by back substitution from H and gamma, then add
    * sum_i z[i] * y[i] (the preconditioned vectors) to psi.
    */
   void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
     CompSolutionTimer.Start();
     for (int i = iter; i >= 0; i--) {
       y[i] = gamma[i];
       for (int k = i + 1; k <= iter; k++)
         y[i] = y[i] - H(k, i) * y[k];
       y[i] = y[i] / H(i, i);
     }
     for (int i = 0; i <= iter; i++)
       psi = psi + z[i] * y[i];
     CompSolutionTimer.Stop();
   }
 private:
   // Rejects a zero restart length before the constructor divides by it.
   static Integer checkedRestartLength(Integer restart_length) {
     assert(restart_length > 0);
     return restart_length;
   }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/GeneralisedMinimalResidual.h
@@ -1,242 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/GeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted GMRES solver for LinOp * psi = src.
 //
 // Each restart (outerLoopBody) builds a Krylov basis of at most RestartLength
 // vectors with Arnoldi (arnoldiStep), keeps the small Hessenberg system in
 // triangular form with Givens rotations (qrUpdate) and finally adds the
 // correction to psi (computeSolution).
 //
 // H is indexed as H(column, row): step `iter` fills H(iter, 0..iter+1).
 template<class Field>
 class GeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;
  Integer MaxIterations;
  Integer RestartLength;
  Integer MaxNumberOfRestarts;
  Integer IterationCount; // Number of iterations the GMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;
  GridStopWatch LinalgTimer;
  GridStopWatch QrTimer;
  GridStopWatch CompSolutionTimer;
  Eigen::MatrixXcd H;
  std::vector<std::complex<double>> y;
  std::vector<std::complex<double>> gamma;
  std::vector<std::complex<double>> c;
  std::vector<std::complex<double>> s;

  // tol            : relative target, the solve stops once |r|^2 <= tol^2 |src|^2
  // maxit          : upper bound on the total number of Arnoldi steps
  // restart_length : Krylov space dimension per restart (must be > 0)
  // err_on_no_conv : assert(0) if the target is not reached
  //
  // The initialisers are listed in declaration order, which is the order in
  // which they are executed anyway (MaxNumberOfRestarts and H read members
  // declared before them).
  GeneralisedMinimalResidual(RealD   tol,
                             Integer maxit,
                             Integer restart_length,
                             bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv)
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , IterationCount(0)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.) {}

  // Solve LinOp * psi = src, using the incoming psi as initial guess.
  // On success the timings and the true residual are logged; on failure a
  // message is logged and, if ErrorOnNoConverge is set, assert(0) fires.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);

    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);

    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq; // squared residual target

    Field r(src._grid);

    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "GeneralisedMinimalResidual:   src " << ssq   << std::endl;

    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();

    GridStopWatch SolverTimer;
    SolverTimer.Start();

    IterationCount = 0;

    for (int k=0; k<MaxNumberOfRestarts; k++) {
      RealD cp = outerLoopBody(LinOp, src, psi, rsq);

      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();

        // Recompute the residual explicitly for the report.
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);

        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;

        std::cout << GridLogMessage        << "GeneralisedMinimalResidual: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: Total   " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: Matrix  " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: Linalg  " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: QR      " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "GMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
        return;
      }
    }

    std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }

  // One restart cycle. Returns the squared residual estimate |gamma[i+1]|^2
  // reached when the cycle ends (restart length exhausted, MaxIterations hit,
  // or target rsq met). psi is updated in place.
  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
    RealD cp = 0;

    Field w(src._grid);
    Field r(src._grid);

    // this should probably be made a class member so that it is only allocated once, not in every restart
    std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;

    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();

    LinalgTimer.Start();
    r = src - w;

    RealD rnorm = sqrt(norm2(r));
    if (rnorm == 0.) {
      // psi already solves the system exactly. Normalising r would divide by
      // zero and fill the Krylov basis with NaNs, so report a zero residual
      // (cp == 0 <= rsq) and leave psi untouched.
      LinalgTimer.Stop();
      return cp;
    }

    gamma[0] = rnorm;
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();

    for (int i=0; i<RestartLength; i++) {
      IterationCount++;

      arnoldiStep(LinOp, v, w, i);
      qrUpdate(i);

      cp = std::norm(gamma[i+1]);

      std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;

      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(v, psi, i);
        return cp;
      }
    }

    assert(0); // Never reached
    return cp;
  }

  // Arnoldi step `iter`: w = LinOp * v[iter], orthogonalised against
  // v[0..iter] one vector at a time; coefficients go to H(iter, 0..iter), the
  // remaining norm to H(iter, iter+1) and the normalised rest to v[iter+1].
  void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
    MatrixTimer.Start();
    LinOp.Op(v[iter], w);
    MatrixTimer.Stop();

    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }

    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }

  // Apply the stored Givens rotations to the entries produced by step `iter`,
  // compute the rotation that zeroes H(iter, iter+1) and apply it to H and gamma.
  void qrUpdate(int iter) {
    QrTimer.Start();

    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }

    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;

    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;

    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];

    QrTimer.Stop();
  }

  // Back substitution for y on the triangular system, followed by
  // psi += sum_{i=0..iter} v[i] * y[i].
  void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
    CompSolutionTimer.Start();

    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }

    for (int i = 0; i <= iter; i++)
      psi = psi + v[i] * y[i];

    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -1,842 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Chulwoo Jung <chulwoo@bnl.gov>
 Author: Christoph Lehner <clehner@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_BIRL_H
 #define GRID_BIRL_H
 #include <string.h> //memset
 //#include <zlib.h>
 #include <sys/stat.h>
 namespace Grid { 
  ////////////////////////////////////////////////////////
  // Move following 100 LOC to lattice/Lattice_basis.h
  ////////////////////////////////////////////////////////
 // Remove from w its components along basis[0], ..., basis[k-1].
 // The projections are applied one after another, each against the already
 // updated w. w is NOT normalised here.
 template<class Field>
 void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
 {
  int idx = 0;
  while ( idx < k ) {
    auto overlap = innerProduct(basis[idx],w);
    w = w - overlap*basis[idx];
    ++idx;
  }
 }
 // In-place rotation of a set of basis vectors, site by site:
 //   basis[j] <- sum_{k=k0}^{k1-1} Qt(j,k) * basis[k]     for j in [j0, j1)
 // Vectors with index outside [j0, j1) are left untouched.
 //   Qt      : rotation coefficients, read as Qt(j,k)
 //   j0, j1  : half-open range of output vectors
 //   k0, k1  : half-open range of input vectors entering each sum
 //   Nm      : size of the per-thread scratch buffer (must exceed every j used)
 template<class Field>
 void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
 {
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0]._grid;
  parallel_region
  {
    std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
    parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
      // Accumulate all rotated values for this site first ...
      for(int j=j0; j<j1; ++j) B[j]=0.;
      for(int j=j0; j<j1; ++j){
 	for(int k=k0; k<k1; ++k){
 	  B[j] +=Qt(j,k) * basis[k]._odata[ss];
 	}
      }
      // ... and only then overwrite, so every sum above saw the old basis.
      for(int j=j0; j<j1; ++j){
 	  basis[j]._odata[ss] = B[j];
      }
    }
  }
 }
 // Extract a single rotated vector
 template<class Field>
 void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
 {
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0]._grid;
  result.checkerboard = basis[0].checkerboard;
  parallel_for(int ss=0;ss < grid->oSites();ss++){
    vobj B = zero;
    for(int k=k0; k<k1; ++k){
      B +=Qt(j,k) * basis[k]._odata[ss];
    }
    result._odata[ss] = B;
  }
 }
 template<class Field>
 void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
 {
  int vlen = idx.size();
  assert(vlen>=1);
  assert(vlen<=sort_vals.size());
  assert(vlen<=_v.size());
  for (size_t i=0;i<vlen;i++) {
    if (idx[i] != i) {
      //////////////////////////////////////
      // idx[i] is a table of desired sources giving a permutation.
      // Swap v[i] with v[idx[i]].
      // Find  j>i for which _vnew[j] = _vold[i],
      // track the move idx[j] => idx[i]
      // track the move idx[i] => i
      //////////////////////////////////////
      size_t j;
      for (j=i;j<idx.size();j++)
 	if (idx[j]==i)
 	  break;
      assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i);
      std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
      std::swap(sort_vals[i],sort_vals[idx[i]]);
      idx[j] = idx[i];
      idx[i] = i;
    }
  }
 }
 // Return the index permutation that orders sort_vals by increasing absolute
 // value; sort_vals itself is not modified.
 inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
 {
  // Compare two positions by the magnitude of the value they refer to.
  const auto magnitudeLess = [&sort_vals](int lhs, int rhs) {
    return ::fabs(sort_vals[lhs]) < ::fabs(sort_vals[rhs]);
  };
  std::vector<int> order(sort_vals.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), magnitudeLess);
  return order;
 }
 // Sort _v and sort_vals together by |sort_vals|: increasing by default,
 // decreasing when `reverse` is set.
 template<class Field>
 void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
 {
  const std::vector<int> ascending = basisSortGetIndex(sort_vals);
  std::vector<int> order = reverse ? std::vector<int>(ascending.rbegin(), ascending.rend())
                                   : ascending;
  basisReorderInPlace(_v,sort_vals,order);
 }
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
 // Interface used by ImplicitlyRestartedLanczos to decide whether an
 // approximate eigenpair is converged.
 //   j             : index of the vector (used for reporting)
 //   resid         : convergence threshold
 //   evec          : candidate eigenvector
 //   eval          : in: current estimate, out: value determined by the tester
 //   evalMaxApprox : estimate of the largest eigenvalue, used for normalisation
 // Both calls return non-zero when the pair is accepted.
 template<class Field> class ImplicitlyRestartedLanczosTester 
 {
 public:
  // Polymorphic base: make destruction through a base pointer well defined.
  virtual ~ImplicitlyRestartedLanczosTester() = default;
  virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
  virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
 };
 // Selects which routine ImplicitlyRestartedLanczos::diagonalize() dispatches to
 // for the tridiagonal Lanczos matrix.
 enum IRLdiagonalisation { 
  // diagonalize_lapack(): LAPACK dstegr; asserts unless built with USE_LAPACK
  IRLdiagonaliseWithDSTEGR,
  // diagonalize_QR(): shifted QR iteration implemented in this file
  IRLdiagonaliseWithQR,
  // diagonalize_Eigen(): Eigen::SelfAdjointEigenSolver (constructor default)
  IRLdiagonaliseWithEigen
 };
 // Default convergence tester: applies the Hermitian operator to the candidate
 // vector, takes the Rayleigh quotient as eigenvalue and accepts the pair when
 //   |H B - eval B|^2 / evalMaxApprox^2 < eresid^2 .
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
 public:
  LinearFunction<Field>       &_HermOp;
  ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  }

  // For this tester reconstructing the eigenvalue is the same as testing it.
  int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) override
  {
    return TestConvergence(j,resid,B,eval,evalMaxApprox);
  }

  // j             : index, only used in the log line
  // eresid        : threshold (compared squared)
  // B             : candidate eigenvector (not modified)
  // eval          : in: estimate shown in brackets in the log; out: Rayleigh quotient
  // evalMaxApprox : normalisation for the residual
  // Returns 1 if converged, 0 otherwise.
  int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox) override
  {
    Field v(B);
    RealD eval_poly = eval;
    // Apply operator
    _HermOp(B,v);

    RealD vnum = real(innerProduct(B,v)); // HermOp.
    RealD vden = norm2(B);
    eval   = vnum/vden;
    v -= eval*B;

    // Residual norm relative to the largest eigenvalue. (The former unused
    // norm2(v) taken before the subtraction was dropped: it cost a full
    // reduction per call and its result was never read.)
    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);

    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
	     <<std::endl;

    return (vv<eresid*eresid) ? 1 : 0;
  }
 };
 template<class Field> 
 class ImplicitlyRestartedLanczos {
 private:
  const RealD small = 1.0e-8;
  int MaxIter;
  int MinRestart; // Minimum number of restarts; only check for convergence after
  int Nstop;   // Number of evecs checked for convergence
  int Nk;      // Number of converged sought
  //  int Np;      // Np -- Number of spare vecs in krylov space //  == Nm - Nk
  int Nm;      // Nm -- total number of vectors
  IRLdiagonalisation diagonalisation;
  int orth_period;
  RealD OrthoTime;
  RealD eresid, betastp;
  ////////////////////////////////
  // Embedded objects
  ////////////////////////////////
  LinearFunction<Field>       &_PolyOp;
  LinearFunction<Field>       &_HermOp;
  ImplicitlyRestartedLanczosTester<Field> &_Tester;
  // Default tester provided (we need a ref to something in default case)
  ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
  /////////////////////////
  // Constructor
  /////////////////////////
 public:       
  //////////////////////////////////////////////////////////////////
  // PAB:
  //////////////////////////////////////////////////////////////////
  // Too many options  & knobs. 
  // Eliminate:
  //   orth_period
  //   betastp
  //   MinRestart
  //
  // Do we really need orth_period
  // What is the theoretical basis & guarantees of betastp ?
  // Nstop=Nk viable?
  // MinRestart avoidable with new convergence test?
  // Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
  // HermOp could be eliminated if we dropped the Power method for max eval.
  // -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
  //////////////////////////////////////////////////////////////////
 // Constructor taking a user supplied convergence tester.
 //   PolyOp  : operator applied in every Lanczos step
 //   HermOp  : operator used for the largest-eigenvalue estimate (and by the
 //             embedded default tester, which is constructed but not used here)
 //   Tester  : convergence test used by calc()
 // Initialisers are listed in member declaration order, which is the order
 // they run in; OrthoTime now starts from a defined value.
 ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
                            LinearFunction<Field> & HermOp,
                            ImplicitlyRestartedLanczosTester<Field> & Tester,
                            int _Nstop, // sought vecs
                            int _Nk, // sought vecs
                            int _Nm, // spare vecs
                            RealD _eresid, // resid in lmdue deficit 
                            int _MaxIter, // Max iterations
                            RealD _betastp=0.0, // if beta(k) < betastp: converged
                            int _MinRestart=1, int _orth_period = 1,
                            IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
    MaxIter(_MaxIter), MinRestart(_MinRestart),
    Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
    diagonalisation(_diagonalisation), orth_period(_orth_period),
    OrthoTime(0.),
    eresid(_eresid), betastp(_betastp),
    _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester),
    SimpleTester(HermOp)  { }
    // Constructor without an explicit tester: the embedded SimpleTester
    // (built from HermOp) is used for convergence checks.
    // Initialisers are listed in member declaration order, which is the order
    // they run in. _Tester is bound to SimpleTester before that member is
    // constructed; only the reference is taken at that point, it is not used
    // until calc(). OrthoTime now starts from a defined value.
    ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
                               LinearFunction<Field> & HermOp,
                               int _Nstop, // sought vecs
                               int _Nk, // sought vecs
                               int _Nm, // spare vecs
                               RealD _eresid, // resid in lmdue deficit 
                               int _MaxIter, // Max iterations
                               RealD _betastp=0.0, // if beta(k) < betastp: converged
                               int _MinRestart=1, int _orth_period = 1,
                               IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
    MaxIter(_MaxIter), MinRestart(_MinRestart),
    Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
    diagonalisation(_diagonalisation), orth_period(_orth_period),
    OrthoTime(0.),
    eresid(_eresid), betastp(_betastp),
    _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester),
    SimpleTester(HermOp)  { }
  ////////////////////////////////
  // Helpers
  ////////////////////////////////
  // Scale v to unit norm in place and return the norm it had before.
  // No guard against a zero vector: the scale factor is 1.0/norm.
  template<typename T>  static RealD normalise(T& v) 
  {
    const RealD squared = norm2(v);
    const RealD length  = sqrt(squared);
    v = v * (1.0/length);
    return length;
  }
  // Orthogonalise w against evec[0..k-1], renormalise it, and add the elapsed
  // wall time (usecond() converted to seconds) to OrthoTime.
  void orthogonalize(Field& w, std::vector<Field>& evec,int k)
  {
    OrthoTime = OrthoTime - usecond()/1e6;   // start of timed section

    basisOrthogonalize(evec,w,k);
    normalise(w);

    OrthoTime = OrthoTime + usecond()/1e6;   // end of timed section
  }
 /* Rudy Arthur's thesis pp.137
 ------------------------
 Require: M > K, P = M − K
 Compute the factorization A V_M = V_M H_M + f_M e_M†
 repeat
  Q = I
  for i = 1,...,P do
    Q_i R_i = H_M − θ_i I ;  Q = Q Q_i
    H_M = Q_i† H_M Q_i
  end for
  β_K = H_M(K+1,K) ;  σ_K = Q(M,K)
  r = v_{K+1} β_K + r σ_K
  V_K = V_M(1:M) Q(1:M,1:K)
  H_K = H_M(1:K,1:K)
  → A V_K = V_K H_K + f_K e_K†
  Extend to an M = K + P step factorization A V_M = V_M H_M + f_M e_M†
 until convergence
 */
  // Run the implicitly restarted Lanczos iteration.
  //   eval    : in: storage of at least Nm entries; out: accepted eigenvalues (resized to Nconv)
  //   evec    : in: storage of at least Nm fields on src's grid; out: accepted eigenvectors (resized to Nconv)
  //   src     : start vector
  //   Nconv   : out: number of eigenpairs accepted by the final test
  //   reverse : forwarded to basisSortInPlace for the final ordering
  // Calls abort() if MaxIter restarts pass without the convergence test succeeding.
  void calc(std::vector<RealD>& eval, std::vector<Field>& evec,  const Field& src, int& Nconv, bool reverse=false)
  {
    GridBase *grid = src._grid;
    assert(grid == evec[0]._grid);
    GridLogIRL.TimingMode(1);
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
    std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
    std::cout << GridLogIRL <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
    std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
    std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
      std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
    } else if ( diagonalisation == IRLdiagonaliseWithQR ) { 
      std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
    }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
      std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
    }
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    assert(Nm <= evec.size() && Nm <= eval.size());
    // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
    // (power iteration with _HermOp, stopped once successive Rayleigh quotients agree to 5%)
    RealD evalMaxApprox = 0.0;
    {
      auto src_n = src;
      auto tmp = src;
      const int _MAX_ITER_IRL_MEVAPP_ = 50;
      for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
 	normalise(src_n);
 	_HermOp(src_n,tmp);
 	RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
 	RealD vden = norm2(src_n);
 	RealD na = vnum/vden;
 	// setting i to the bound makes this the last pass of the loop
 	if (fabs(evalMaxApprox/na - 1.0) < 0.05)
 	  i=_MAX_ITER_IRL_MEVAPP_;
 	evalMaxApprox = na;
 	std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
 	src_n = tmp;
      }
    }
    // lme holds the off-diagonal (beta) and eval the diagonal (alpha) of the
    // Lanczos tridiagonal matrix; the *2 copies are scratch for diagonalisation.
    std::vector<RealD> lme(Nm);  
    std::vector<RealD> lme2(Nm);
    std::vector<RealD> eval2(Nm);
    std::vector<RealD> eval2_copy(Nm);
    Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
    Field f(grid);
    Field v(grid);
    int k1 = 1;
    int k2 = Nk;
    RealD beta_k;
    Nconv = 0;
    // Set initial vector
    evec[0] = src;
    normalise(evec[0]);
    // Initial Nk steps
    OrthoTime=0.;
    for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
    std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
    std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
    //////////////////////////////////
    // Restarting loop begins
    //////////////////////////////////
    int iter;
    for(iter = 0; iter<MaxIter; ++iter){
      OrthoTime=0.;
      std::cout<< GridLogMessage <<" **********************"<< std::endl;
      std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
      std::cout<< GridLogMessage <<" **********************"<< std::endl;
      std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
      // Extend the factorisation from Nk to Nm vectors
      for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
      f *= lme[Nm-1];
      std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
      // NOTE(review): this timing refers to the Nm-Nk extension steps although the message says "Initial steps"
      std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
      //////////////////////////////////
      // getting eigenvalues
      //////////////////////////////////
      for(int k=0; k<Nm; ++k){
 	eval2[k] = eval[k+k1-1];
 	lme2[k] = lme[k+k1-1];
      }
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
      std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
      //////////////////////////////////
      // sorting
      //////////////////////////////////
      eval2_copy = eval2;
      // middle == begin+Nm == end here, so this orders the whole range, largest first
      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
      std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
      const int chunk=8;
      for(int io=0; io<k2;io+=chunk){
 	std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
 	for(int ii=0;ii<chunk;ii++){
 	  if ( (io+ii)<k2 )
 	    std::cout<< " "<< std::setw(12)<< eval2[io+ii];
 	}
 	std::cout << std::endl;
      }
      //////////////////////////////////
      // Implicitly shifted QR transformations
      // (one sweep per unwanted value eval2[k2..Nm-1], used as the shift)
      //////////////////////////////////
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      for(int ip=k2; ip<Nm; ++ip){ 
 	QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
      }
      std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
      // NOTE(review): assert(k2<Nm) appears twice; the second is redundant
      assert(k2<Nm);      assert(k2<Nm);      assert(k1>0);
      basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
      std::cout<<GridLogIRL <<"basisRotated  by Qt"<<std::endl;
      ////////////////////////////////////////////////////
      // Compressed vector f and beta(k2)
      ////////////////////////////////////////////////////
      f *= Qt(k2-1,Nm-1);
      f += lme[k2-1] * evec[k2];
      beta_k = norm2(f);
      beta_k = sqrt(beta_k);
      std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
      RealD betar = 1.0/beta_k;
      evec[k2] = betar * f;
      lme[k2-1] = beta_k;
      ////////////////////////////////////////////////////
      // Convergence test
      ////////////////////////////////////////////////////
      for(int k=0; k<Nm; ++k){    
 	eval2[k] = eval[k];
 	lme2[k] = lme[k];
      }
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
      std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
      Nconv = 0;
      if (iter >= MinRestart) {
 	std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
 	Field B(grid); B.checkerboard = evec[0].checkerboard;
 	//  power of two search pattern;  not every evalue in eval2 is assessed.
 	//  (indices Nstop-1, Nstop-2, Nstop-4, ... are tested)
 	int allconv =1;
 	for(int jj = 1; jj<=Nstop; jj*=2){
 	  int j = Nstop-jj;
 	  RealD e = eval2_copy[j]; // Discard the evalue
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
 	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
 	    allconv=0;
 	  }
 	}
 	// Do evec[0] for good measure
 	{ 
 	  int j=0;
 	  RealD e = eval2_copy[0]; 
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
 	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
 	}
 	if ( allconv ) Nconv = Nstop;
 	// test if we converged, if so, terminate
 	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
 	//	if( Nconv>=Nstop || beta_k < betastp){
 	if( Nconv>=Nstop){
 	  goto converged;
 	}
      } else {
 	std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
      } // end of convergence-test branch
    } // end of iter loop
    std::cout<<GridLogError<<"\n NOT converged.\n";
    abort();
  converged:
    {
      Field B(grid); B.checkerboard = evec[0].checkerboard;
      basisRotate(evec,Qt,0,Nk,0,Nk,Nm);	    
      std::cout << GridLogIRL << " Rotated basis"<<std::endl;
      Nconv=0;
      //////////////////////////////////////////////////////////////////////
      // Full final convergence test; unconditionally applied
      //////////////////////////////////////////////////////////////////////
      // NOTE(review): the loop runs j = 0..Nk inclusive, while the rotation above
      // only produced vectors 0..Nk-1 — confirm that testing evec[Nk] is intended.
      for(int j = 0; j<=Nk; j++){
 	B=evec[j];
 	if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
 	  Nconv++;
 	}
      }
      if ( Nconv < Nstop )
 	std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
      eval=eval2;
      //Keep only converged
      // NOTE(review): this keeps the first Nconv entries, which are the converged
      // ones only if no earlier vector failed the test — confirm.
      eval.resize(Nconv);// Nstop?
      evec.resize(Nconv,grid);// Nstop?
      basisSortInPlace(evec,eval,reverse);
    }
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
    std::cout << GridLogIRL << " -- Iterations  = "<< iter   << "\n";
    std::cout << GridLogIRL << " -- beta(k)     = "<< beta_k << "\n";
    std::cout << GridLogIRL << " -- Nconv       = "<< Nconv  << "\n";
    std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
  }
 private:
 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
 2. For k = 1,2,...,m Do:
 3. wk:=Avk−βkv_{k−1}      
 4. αk:=(wk,vk)       // 
 5. wk:=wk−αkvk       // wk orthog vk 
 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
 7. vk+1 := wk/βk+1
 8. EndDo
 */
  // One Lanczos step for vector index k (see the algorithm quoted above):
  //   w      = PolyOp * evec[k] - lme[k-1] * evec[k-1]
  //   lmd[k] = Re <evec[k], w>            (alpha_k)
  //   w     -= lmd[k] * evec[k]
  //   lme[k] = |w|, w normalised          (beta_{k+1})
  // Every orth_period-th step (k>0) w is additionally re-orthogonalised
  // against evec[0..k-1]. Unless k is the last index, w becomes evec[k+1].
  // On return w holds the normalised residual vector.
  void step(std::vector<RealD>& lmd,
            std::vector<RealD>& lme,
            std::vector<Field>& evec,
            Field& w,int Nm,int k)
  {
    const RealD tiny = 1.0e-20;
    assert( k< Nm );

    Field& evec_k = evec[k];

    _PolyOp(evec_k,w);    std::cout<<GridLogIRL << "PolyOp" <<std::endl;

    if(k>0) w -= lme[k-1] * evec[k-1];

    ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
    RealD     alph = real(zalph);

    w = w - alph * evec_k;// 5. wk:=wk−αkvk

    RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
    // 7. vk+1 := wk/βk+1

    lmd[k] = alph;
    lme[k] = beta;

    if (k>0 && k % orth_period == 0) {
      orthogonalize(w,evec,k); // orthonormalise
      std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
    }

    if(k < Nm-1) evec[k+1] = w;

    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    // A vanishing beta is only reported; the iteration is not stopped here.
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
  }
  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
 			 int Nk, int Nm,  
 			 Eigen::MatrixXd & Qt, // Nm x Nm
 			 GridBase *grid)
  {
    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
    for(int i=0;i<Nk;i++)   TriDiag(i,i)   = lmd[i];
    for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
    for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
    for (int i = 0; i < Nk; i++) {
      lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
    }
    for (int i = 0; i < Nk; i++) {
      for (int j = 0; j < Nk; j++) {
 	Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
      }
    }
  }
  ///////////////////////////////////////////////////////////////////////////
  // File could end here if settle on Eigen ??? !!!
  ///////////////////////////////////////////////////////////////////////////
  // One implicitly shifted QR sweep on the symmetric tridiagonal matrix with
  // diagonal lmd and off-diagonal lme, restricted to the 1-based index range
  // [kmin, kmax]. The sweep is a sequence of Givens rotations: the first is
  // built from (lmd[kmin-1]-Dsh, lme[kmin-1]), the following ones chase the
  // out-of-band element x down the matrix. Each rotation is also applied to
  // rows k, k+1 of Qt (columns 0..Nk-1), so Qt accumulates the transformation.
  //   Dsh : shift
  //   Nk  : number of Qt columns updated
  //   Nm  : not used in this routine
  void QR_decomp(std::vector<RealD>& lmd,   // Nm 
 		 std::vector<RealD>& lme,   // Nm 
 		 int Nk, int Nm,            // Nk, Nm
 		 Eigen::MatrixXd& Qt,       // Nm x Nm matrix
 		 RealD Dsh, int kmin, int kmax)
  {
    int k = kmin-1;
    RealD x;
    // First rotation: determined by the shifted leading element
    RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
    RealD c = ( lmd[k] -Dsh) *Fden;
    RealD s = -lme[k] *Fden;
    RealD tmpa1 = lmd[k];
    RealD tmpa2 = lmd[k+1];
    RealD tmpb  = lme[k];
    lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
    lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
    lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
    // x is the element the rotation pushes outside the tridiagonal band
    x        =-s*lme[k+1];
    lme[k+1] = c*lme[k+1];
    for(int i=0; i<Nk; ++i){
      RealD Qtmp1 = Qt(k,i);
      RealD Qtmp2 = Qt(k+1,i);
      Qt(k,i)  = c*Qtmp1 - s*Qtmp2;
      Qt(k+1,i)= s*Qtmp1 + c*Qtmp2; 
    }
    // Givens transformations
    // (each one removes x and, except on the last pass, creates the next x one row further down)
    for(int k = kmin; k < kmax-1; ++k){
      RealD Fden = 1.0/hypot(x,lme[k-1]);
      RealD c = lme[k-1]*Fden;
      RealD s = - x*Fden;
      RealD tmpa1 = lmd[k];
      RealD tmpa2 = lmd[k+1];
      RealD tmpb  = lme[k];
      lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
      lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
      lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
      lme[k-1] = c*lme[k-1] -s*x;
      if(k != kmax-2){
 	x = -s*lme[k+1];
 	lme[k+1] = c*lme[k+1];
      }
      for(int i=0; i<Nk; ++i){
 	RealD Qtmp1 = Qt(k,i);
 	RealD Qtmp2 = Qt(k+1,i);
 	Qt(k,i)     = c*Qtmp1 -s*Qtmp2;
 	Qt(k+1,i)   = s*Qtmp1 +c*Qtmp2;
      }
    }
  }
  // Reset Qt to the Nm x Nm identity and diagonalise the tridiagonal matrix
  // (lmd, lme) with the back end selected by `diagonalisation`.
  // An unknown selector trips assert(0).
  void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
                   int Nk, int Nm,   
                   Eigen::MatrixXd & Qt,
                   GridBase *grid)
  {
    Qt = Eigen::MatrixXd::Identity(Nm,Nm);
    switch ( diagonalisation ) {
    case IRLdiagonaliseWithDSTEGR:
      diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
      break;
    case IRLdiagonaliseWithQR:
      diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
      break;
    case IRLdiagonaliseWithEigen:
      diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
      break;
    default:
      assert(0);
    }
  }
 #ifdef USE_LAPACK
 // Prototype for the LAPACK tridiagonal eigensolver called by diagonalize_lapack().
 // NOTE(review): this declaration sits inside the class body, so as written it
 // names a member function rather than a free function — confirm how the
 // symbol is meant to resolve when USE_LAPACK is defined.
 void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
                   double *vl, double *vu, int *il, int *iu, double *abstol,
                   int *m, double *w, double *z, int *ldz, int *isuppz,
                   double *work, int *lwork, int *iwork, int *liwork,
                   int *info);
 #endif
 // Diagonalise the leading Nk x Nk tridiagonal matrix (diagonal lmd,
 // off-diagonal lme) with LAPACK dstegr. The eigenpair index range is split
 // across the processors of `grid`: each rank computes its own slice, moves it
 // to its global position in zero-filled buffers, and a global sum assembles
 // the full result on every rank. Results are written in reversed order, so
 // lmd[0] receives the last eigenvalue returned and Qt row r the matching vector.
 // Without USE_LAPACK the function asserts.
 // NOTE(review): the stack arrays sized by NN/lwork/liwork are variable-length
 // arrays (a compiler extension in C++), and `info` is never checked.
 // `size`, `uplo`, `compz` and `ifail` are not used below.
 void diagonalize_lapack(std::vector<RealD>& lmd,
 			std::vector<RealD>& lme, 
 			int Nk, int Nm,  
 			Eigen::MatrixXd& Qt,
 			GridBase *grid)
 {
 #ifdef USE_LAPACK
  const int size = Nm;
  int NN = Nk;
  double evals_tmp[NN];
  double evec_tmp[NN][NN];
  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
  double DD[NN];
  double EE[NN];
  // Copy diagonal into DD (and evals_tmp) and off-diagonal into EE
  for (int i = 0; i< NN; i++) {
    for (int j = i - 1; j <= i + 1; j++) {
      if ( j < NN && j >= 0 ) {
 	if (i==j) DD[i] = lmd[i];
 	if (i==j) evals_tmp[i] = lmd[i];
 	if (j==(i-1)) EE[j] = lme[j];
      }
    }
  }
  int evals_found;
  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
  int liwork =  3+NN*10 ;
  int iwork[liwork];
  double work[lwork];
  int isuppz[2*NN];
  char jobz = 'V'; // calculate evals & evecs
  char range = 'I'; // calculate only the il-th through iu-th evals (this rank's slice)
  //    char range = 'A'; // calculate all evals
  char uplo = 'U'; // refer to upper half of original matrix
  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
  int ifail[NN];
  int info;
  int total = grid->_Nprocessors;
  int node  = grid->_processor;
  // Slice of (1-based) eigenvalue indices handled by this rank
  int interval = (NN/total)+1;
  double vl = 0.0, vu = 0.0;
  int il = interval*node+1 , iu = interval*(node+1);
  if (iu > NN)  iu=NN;
  double tol = 0.0;
  if (1) {
    memset(evals_tmp,0,sizeof(double)*NN);
    if ( il <= NN){
      LAPACK_dstegr(&jobz, &range, &NN,
 		    (double*)DD, (double*)EE,
 		    &vl, &vu, &il, &iu, // these four are ignored if second parameter is 'A'
 		    &tol, // tolerance
 		    &evals_found, evals_tmp, (double*)evec_tmp, &NN,
 		    isuppz,
 		    work, &lwork, iwork, &liwork,
 		    &info);
      // Results arrive packed at the front of the buffers: shift them to their
      // global positions (back to front) and zero the slots they came from.
      for (int i = iu-1; i>= il-1; i--){
 	evals_tmp[i] = evals_tmp[i - (il-1)];
 	if (il>1) evals_tmp[i-(il-1)]=0.;
 	for (int j = 0; j< NN; j++){
 	  evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
 	  if (il>1) evec_tmp[i-(il-1)][j]=0.;
 	}
      }
    }
    {
      // Every rank contributed a disjoint slice; summing yields the full set everywhere
      grid->GlobalSumVector(evals_tmp,NN);
      grid->GlobalSumVector((double*)evec_tmp,NN*NN);
    }
  } 
  // Safer to sort instead of just reversing it, 
  // but the document of the routine says evals are sorted in increasing order. 
  // qr gives evals in decreasing order.
  for(int i=0;i<NN;i++){
    lmd [NN-1-i]=evals_tmp[i];
    for(int j=0;j<NN;j++){
      Qt((NN-1-i),j)=evec_tmp[i][j];
    }
  }
 #else 
  assert(0);
 #endif
 }
 void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
 		    int Nk, int Nm,   
 		    Eigen::MatrixXd & Qt,
 		    GridBase *grid)
 {
  int QRiter = 100*Nm;
  int kmin = 1;
  int kmax = Nk;
  // (this should be more sophisticated)
  for(int iter=0; iter<QRiter; ++iter){
    // determination of 2x2 leading submatrix
    RealD dsub = lmd[kmax-1]-lmd[kmax-2];
    RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
    RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
    // (Dsh: shift)
    // transformation
    QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
    // Convergence criterion (redef of kmin and kamx)
    for(int j=kmax-1; j>= kmin; --j){
      RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
      if(fabs(lme[j-1])+dds > dds){
 	kmax = j+1;
 	goto continued;
      }
    }
    QRiter = iter;
    return;
  continued:
    for(int j=0; j<kmax-1; ++j){
      RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
      if(fabs(lme[j])+dds > dds){
 	kmin = j+1;
 	break;
      }
    }
  }
  std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
  abort();
 }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -1,406 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
    Copyright (C) 2015
 Author: Christoph Lehner <clehner@bnl.gov>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LOCAL_COHERENCE_IRL_H
 #define GRID_LOCAL_COHERENCE_IRL_H
 namespace Grid { 
 // Parameter bundle for one Lanczos run: the Chebyshev settings plus the
 // Nstop/Nk/Nm vector counts, the residual target, the iteration cap MaxIt,
 // betastp and MinRes.  The members are listed as (type, name) pairs in the
 // GRID_SERIALIZABLE_CLASS_MEMBERS invocation below.
 // NOTE(review): the macro is defined elsewhere; presumably it declares the
 // members and their read/write support - confirm against its definition.
 struct LanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
 				  ChebyParams, Cheby,/*Chebyshev*/
 				  int, Nstop,    /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
 				  int, Nk,       /*Vecs in Lanczos seek converge*/
 				  int, Nm,       /*Total vecs in Lanczos include restart*/
 				  RealD, resid,  /*residual*/
 				  int, MaxIt, 
 				  RealD, betastp,  /* ? */
 				  int, MinRes);    // Must restart
 };
 // Parameter bundle for a two-level (fine + coarse) Lanczos run: boolean
 // switches for each stage, one LanczosParams set per level, the Chebyshev
 // smoother settings, the coarse relaxation tolerance, the blocking size, and
 // a configuration string plus omega/mass/M5 values.
 // NOTE(review): how each flag is interpreted is decided by the code that
 // consumes this struct, which is not visible here.
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
 				  bool, saveEvecs,
 				  bool, doFine,
 				  bool, doFineRead,
 				  bool, doCoarse,
 	       			  bool, doCoarseRead,
 				  LanczosParams, FineParams,
 				  LanczosParams, CoarseParams,
 				  ChebyParams,   Smoother,
 				  RealD        , coarse_relax_tol,
 				  std::vector<int>, blockSize,
 				  std::string, config,
 				  std::vector < std::complex<double>  >, omega,
 				  RealD, mass,
 				  RealD, M5);
 };
 // Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
 // Coarse-space view of a fine Hermitian operator: a coarse vector is promoted
 // onto the fine grid through the stored subspace, HermOp is applied there, and
 // the result is projected back onto the coarse basis.
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
   using CoarseSiteVector = iVector<CComplex,nbasis >;
   using CoarseField      = Lattice<CoarseSiteVector>;
   using CoarseScalar     = Lattice<CComplex>; // used for inner products on fine field
   using FineField        = Lattice<Fobj>;
   LinearOperatorBase<FineField> &_Linop;   // fine operator, held by reference
   std::vector<FineField>        &subspace; // fine basis vectors, held by reference
   // The basis must be non-empty: operator() reads its grid and checkerboard
   // from subspace[0].
   ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace)
     : _Linop(linop), subspace(_subspace)
   {
     assert(!subspace.empty());
   }
   // out = Project( HermOp( Promote(in) ) ), logging after each stage.
   void operator()(const CoarseField& in, CoarseField& out) {
     GridBase *fineGrid = subspace[0]._grid;
     int       cb       = subspace[0].checkerboard;
     FineField promoted(fineGrid);
     promoted.checkerboard = cb;
     FineField applied(fineGrid);
     applied.checkerboard = cb;
     blockPromote(in,promoted,subspace);
     std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
     _Linop.HermOp(promoted,applied);
     std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
     blockProject(out,applied,subspace);
     std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
   }
 };
 // Coarse-space view of a function of a fine operator: a coarse vector is
 // promoted onto the fine grid through the stored subspace, the operator
 // function _poly is applied with _Linop, and the result is projected back.
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
   using CoarseSiteVector = iVector<CComplex,nbasis >;
   using CoarseField      = Lattice<CoarseSiteVector>;
   using CoarseScalar     = Lattice<CComplex>; // used for inner products on fine field
   using FineField        = Lattice<Fobj>;
   OperatorFunction<FineField>   & _poly;    // function applied on the fine grid
   LinearOperatorBase<FineField> &_Linop;    // fine operator handed to _poly
   std::vector<FineField>        &subspace;  // fine basis vectors, held by reference
   ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
                           LinearOperatorBase<FineField>& linop,
                           std::vector<FineField> & _subspace)
     : _poly(poly), _Linop(linop), subspace(_subspace)
   {
   }
   // out = Project( poly(Linop)( Promote(in) ) ), logging after each stage.
   // Grid and checkerboard are taken from subspace[0].
   void operator()(const CoarseField& in, CoarseField& out) {
     GridBase *fineGrid = subspace[0]._grid;
     int       cb       = subspace[0].checkerboard;
     FineField promoted(fineGrid);
     promoted.checkerboard = cb;
     FineField applied(fineGrid);
     applied.checkerboard = cb;
     blockPromote(in,promoted,subspace);
     std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
     _poly(_Linop,promoted,applied);
     std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
     blockProject(out,applied,subspace);
     std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
   }
 };
 // Convergence tester for a coarse-grid Lanczos.
 //  - TestConvergence works purely on the coarse grid: it applies _Poly to the
 //    candidate and measures the residual of the resulting Rayleigh quotient.
 //  - ReconstructEval promotes the candidate to the fine grid, applies the
 //    smoother, and measures the residual against the fine HermOp.
 // Both overwrite `eval` with the Rayleigh quotient they compute and return 1
 // when the squared residual, divided by evalMaxApprox^2, is below eresid^2.
 template<class Fobj,class CComplex,int nbasis>
 class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
 {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj>          FineField;
  LinearFunction<CoarseField> & _Poly;
  OperatorFunction<FineField>   & _smoother;
  LinearOperatorBase<FineField> &_Linop;
  RealD                          _coarse_relax_tol;
  std::vector<FineField>        &_subspace;
  // Initialisers are listed in declaration order (the order in which they
  // actually run), which avoids -Wreorder diagnostics.
  ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
 					   std::vector<FineField>        &subspace,
 					   RealD coarse_relax_tol=5.0e3) 
    : _Poly(Poly), _smoother(smoother), _Linop(Linop),
      _coarse_relax_tol(coarse_relax_tol), _subspace(subspace)
  {    };
  // Coarse-only test.  j is used for logging only; eval is replaced by
  // <B|Poly B>/<B|B>.  Returns 1 if converged, 0 otherwise.
  int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    CoarseField v(B);
    RealD eval_poly = eval; // incoming value, kept only for the log line
    // Apply operator
    _Poly(B,v);
    RealD vnum = real(innerProduct(B,v)); // HermOp.
    RealD vden = norm2(B);
    eval   = vnum/vden;
    v -= eval*B;
    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;
    return conv;
  }
  // Fine-grid test.  B is promoted through _subspace, smoothed, and the
  // Rayleigh quotient of the fine HermOp replaces eval.  For j > nbasis the
  // tolerance is loosened by _coarse_relax_tol.  Returns 1 if converged.
  int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
  {
    GridBase *FineGrid = _subspace[0]._grid;    
    int checkerboard   = _subspace[0].checkerboard;
    FineField fB(FineGrid);fB.checkerboard =checkerboard;
    FineField fv(FineGrid);fv.checkerboard =checkerboard;
    blockPromote(B,fv,_subspace);  
    _smoother(_Linop,fv,fB); 
    RealD eval_poly = eval; // incoming value, kept only for the log line
    _Linop.HermOp(fB,fv);
    RealD vnum = real(innerProduct(fB,fv)); // HermOp.
    RealD vden = norm2(fB);
    eval   = vnum/vden;
    fv -= eval*fB;
    RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
    std::cout.precision(13);
    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
 	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
 	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
 	     <<std::endl;
    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
    if( (vv<eresid*eresid) ) return 1;
    return 0;
  }
 };
 ////////////////////////////////////////////
 // Make serializable Lanczos params
 ////////////////////////////////////////////
 // Two-level Lanczos driver.
 //  - calcFine runs an implicitly restarted Lanczos on the fine operator and
 //    keeps the first nbasis vectors in `subspace`.
 //  - Orthogonalise block-orthogonalises that subspace (two passes).
 //  - calcCoarse runs a second Lanczos on the operator projected onto the
 //    subspace, storing results in evec_coarse / evals_coarse.
 //  - testFine / testCoarse re-check stored eigenpairs with ReconstructEval.
 // Results live either in this object's private vectors (first constructor)
 // or in caller-supplied vectors (second constructor); the reference members
 // evals_fine, evals_coarse, subspace and evec_coarse point at whichever is used.
 // NOTE(review): with the first constructor those references alias this
 // object's own private vectors, so an implicitly generated copy would keep
 // referring to the source object's storage.
 template<class Fobj,class CComplex,int nbasis>
 class LocalCoherenceLanczos 
 {
 public:
  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
  typedef Lattice<CComplex>                   CoarseScalar; // used for inner products on fine field
  typedef Lattice<CoarseSiteVector>           CoarseField;
  typedef Lattice<Fobj>                       FineField;
 protected:
  GridBase *_CoarseGrid;
  GridBase *_FineGrid;
  int _checkerboard;
  LinearOperatorBase<FineField>                 & _FineOp;
  std::vector<RealD>                              &evals_fine;
  std::vector<RealD>                              &evals_coarse; 
  std::vector<FineField>                          &subspace;
  std::vector<CoarseField>                        &evec_coarse;
 private:
  // Internal storage, used only by the first constructor.
  std::vector<RealD>                              _evals_fine;
  std::vector<RealD>                              _evals_coarse; 
  std::vector<FineField>                          _subspace;
  std::vector<CoarseField>                        _evec_coarse;
 public:
  // Self-contained variant: results are kept in this object's private vectors.
  // Initialisers follow declaration order (the order in which they run).
  LocalCoherenceLanczos(GridBase *FineGrid,
 			GridBase *CoarseGrid,
 			LinearOperatorBase<FineField> &FineOp,
 			int checkerboard) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
    _checkerboard(checkerboard),
    _FineOp(FineOp),
    evals_fine  (_evals_fine),
    evals_coarse(_evals_coarse),
    subspace    (_subspace),
    evec_coarse(_evec_coarse)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
  //////////////////////////////////////////////////////////////////////////
  // Alternate constructor, external storage for use by Hadrons module
  //////////////////////////////////////////////////////////////////////////
  LocalCoherenceLanczos(GridBase *FineGrid,
 			GridBase *CoarseGrid,
 			LinearOperatorBase<FineField> &FineOp,
 			int checkerboard,
 			std::vector<FineField>   &ext_subspace,
 			std::vector<CoarseField> &ext_coarse,
 			std::vector<RealD>       &ext_eval_fine,
 			std::vector<RealD>       &ext_eval_coarse
 			) :
    _CoarseGrid(CoarseGrid),
    _FineGrid(FineGrid),
    _checkerboard(checkerboard),
    _FineOp(FineOp),
    evals_fine  (ext_eval_fine), 
    evals_coarse(ext_eval_coarse),
    subspace    (ext_subspace),
    evec_coarse (ext_coarse)
  {
    evals_fine.resize(0);
    evals_coarse.resize(0);
  };
  // Block-orthogonalise the fine subspace; the pass is run twice.
  void Orthogonalise(void ) {
    CoarseScalar InnerProd(_CoarseGrid);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
    blockOrthogonalise(InnerProd,subspace);
  };
  // Scale v to unit norm in place and return its norm before scaling.
  template<typename T>  static RealD normalise(T& v) 
  {
    RealD nn = norm2(v);
    nn = ::sqrt(nn);
    v = v * (1.0/nn);
    return nn;
  }
  /*
  void fakeFine(void)
  {
    int Nk = nbasis;
    subspace.resize(Nk,_FineGrid);
    subspace[0]=1.0;
    subspace[0].checkerboard=_checkerboard;
    normalise(subspace[0]);
    PlainHermOp<FineField>    Op(_FineOp);
    for(int k=1;k<Nk;k++){
      subspace[k].checkerboard=_checkerboard;
      Op(subspace[k-1],subspace[k]);
      normalise(subspace[k]);
    }
  }
  */
  // Re-check every stored fine eigenpair against the fine operator.
  // ReconstructEval receives evals_fine[k] by reference, so it is called
  // outside assert(): the call must still happen when NDEBUG is defined.
  void testFine(RealD resid) 
  {
    assert(evals_fine.size() == nbasis);
    assert(subspace.size() == nbasis);
    PlainHermOp<FineField>    Op(_FineOp);
    ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
    for(int k=0;k<nbasis;k++){
      const int converged = SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0);
      assert(converged==1);
      (void)converged; // silence unused-variable warnings under NDEBUG
    }
  }
  // Re-check every stored coarse eigenpair through the smoothed tester.
  // Vectors with index >= nbasis are tested against resid*relax.
  void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
  {
    assert(evals_fine.size() == nbasis);
    assert(subspace.size() == nbasis);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
    const int ncoarse = static_cast<int>(evec_coarse.size());
    for(int k=0;k<ncoarse;k++){
      const RealD tol = ( k < nbasis ) ? resid : resid*relax;
      // Called outside assert() so the eigenvalue update survives NDEBUG builds.
      const int converged = ChebySmoothTester.ReconstructEval(k,tol,evec_coarse[k],evals_coarse[k],1.0);
      assert(converged==1);
      (void)converged; // silence unused-variable warnings under NDEBUG
    }
  }
  // Fine-grid Lanczos.  The iteration works with Nm vectors; afterwards the
  // results are truncated to the first nbasis.  Requires nbasis <= Nm,
  // Nstop >= nbasis, and at least nbasis converged vectors.
  void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, 
 		RealD MaxIt, RealD betastp, int MinRes)
  {
    assert(nbasis<=Nm);
    Chebyshev<FineField>      Cheby(cheby_parms);
    FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
    PlainHermOp<FineField>    Op(_FineOp);
    evals_fine.resize(Nm);
    subspace.resize(Nm,_FineGrid);
    ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
    int Nconv;
    IRL.calc(evals_fine,subspace,src,Nconv,false);
    // Shrink down to number saved
    assert(Nstop>=nbasis);
    assert(Nconv>=nbasis);
    evals_fine.resize(nbasis);
    subspace.resize(nbasis,_FineGrid);
  }
  // Coarse-grid Lanczos on the operator projected onto `subspace`.  The
  // Chebyshev-filtered projected operator drives the iteration and the
  // smoothed tester decides convergence.  Results are truncated to Nstop
  // vectors and the coarse eigenvalues are printed.
  void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
  {
    Chebyshev<FineField>                          Cheby(cheby_op);
    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
    //////////////////////////////////////////////////////////////////////////////////////////////////
    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
    evals_coarse.resize(Nm);
    evec_coarse.resize(Nm,_CoarseGrid);
    CoarseField src(_CoarseGrid);     src=1.0; 
    ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
    int Nconv=0;
    IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
    assert(Nconv>=Nstop);
    evals_coarse.resize(Nstop);
    evec_coarse.resize (Nstop,_CoarseGrid);
    for (int i=0;i<Nstop;i++){
      std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
    }
  }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/MinimalResidual.h
+++ b/Grid/algorithms/iterative/MinimalResidual.h
@@ -1,156 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/MinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_MINIMAL_RESIDUAL_H
 #define GRID_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Minimal residual (MR) iterative solver with an over-relaxation factor.
 // Each iteration computes Mr = Op(r), a = overRelaxParam * <Mr|r> / |Mr|^2,
 // then updates psi += a r and r -= a Mr.  Iteration stops once
 // |r|^2 <= Tolerance^2 |src|^2 or after MaxIterations steps.
 template<class Field> class MinimalResidual : public OperatorFunction<Field> {
 public:
  bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
                          // Defaults true.
  RealD   Tolerance;
  Integer MaxIterations;
  RealD   overRelaxParam;
  Integer IterationsToComplete; // Number of iterations the MR took to finish.
                                // Filled in upon completion
  // Initialisers follow declaration order (the order in which they run).
  // IterationsToComplete starts at 0 so it is never read uninitialised.
  MinimalResidual(RealD tol, Integer maxit, Real ovrelparam = 1.0, bool err_on_no_conv = true)
    : ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit), overRelaxParam(ovrelparam), IterationsToComplete(0){};
  // Solve Linop.Op(psi) = src, using the incoming psi as the initial guess.
  // On return psi holds the final iterate and IterationsToComplete the
  // iteration count (0 if the initial guess already meets the tolerance).
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    Complex a, c;
    Real    d = 0; // logged below before its first assignment in the loop
    Field Mr(src);
    Field r(src);
    // Initial residual computation & set up
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq;
    Linop.Op(psi, Mr);
    r = src - Mr;
    RealD cp = norm2(r);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
    std::cout << GridLogIterative << "MinimalResidual:   src " << ssq << std::endl;
    std::cout << GridLogIterative << "MinimalResidual:    mp " << d << std::endl;
    std::cout << GridLogIterative << "MinimalResidual:  cp,r " << cp << std::endl;
    if (cp <= rsq) {
      // The initial guess is already good enough: no iterations were needed.
      IterationsToComplete = 0;
      return;
    }
    std::cout << GridLogIterative << "MinimalResidual: k=0 residual " << cp << " target " << rsq << std::endl;
    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      MatrixTimer.Start();
      Linop.Op(r, Mr);
      MatrixTimer.Stop();
      LinalgTimer.Start();
      c = innerProduct(Mr, r);
      d = norm2(Mr);
      a = c / d;
      a = a * overRelaxParam;
      psi = psi + r * a;
      r = r - Mr * a;
      cp = norm2(r);
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "MinimalResidual: Iteration " << k
                << " residual " << cp << " target " << rsq << std::endl;
      std::cout << GridLogDebug << "a = " << a << " c = " << c << " d = " << d << std::endl;
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // Recompute the residual from psi to report the true value, not the
        // recursively updated one.
        Linop.Op(psi, Mr);
        r = src - Mr;
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "MinimalResidual Converged on iteration " << k
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "MR Time elapsed: Total   " << SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MR Time elapsed: Matrix  " << MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MR Time elapsed: Linalg  " << LinalgTimer.Elapsed() << std::endl;
        if (ErrorOnNoConverge)
          assert(true_residual / Tolerance < 10000.0);
        IterationsToComplete = k;
        return;
      }
    }
    std::cout << GridLogMessage << "MinimalResidual did NOT converge"
              << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
    IterationsToComplete = k;
  }
 };
 } // namespace Grid
 #endif
--- a/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
+++ b/Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
@@ -1,273 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
 Copyright (C) 2015
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 #define GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
 namespace Grid {
 // Restarted flexible GMRES in which the preconditioner runs on a different
 // field type: each Arnoldi step converts the current basis vector to FieldF,
 // applies Preconditioner there, and converts the result back to FieldD
 // before applying the operator.  The enable_if guards require
 // getPrecision<FieldD>::value == 2 and getPrecision<FieldF>::value == 1.
 //
 // Storage convention used throughout: H(col, row), i.e. the first index is
 // the Arnoldi iteration (column of the Hessenberg matrix), the second the row.
 template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
 public:
  bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
                          // defaults to true
  RealD   Tolerance;
  Integer MaxIterations;
  Integer RestartLength;
  Integer MaxNumberOfRestarts;
  Integer IterationCount; // Number of iterations the MPFGMRES took to finish,
                          // filled in upon completion
  GridStopWatch MatrixTimer;
  GridStopWatch PrecTimer;
  GridStopWatch LinalgTimer;
  GridStopWatch QrTimer;
  GridStopWatch CompSolutionTimer;
  GridStopWatch ChangePrecTimer;
  Eigen::MatrixXcd H;
  std::vector<std::complex<double>> y;
  std::vector<std::complex<double>> gamma;
  std::vector<std::complex<double>> c;
  std::vector<std::complex<double>> s;
  GridBase* SinglePrecGrid;
  LinearFunction<FieldF> &Preconditioner;
  // Initialisers are listed in declaration order (the order in which they
  // run).  MaxNumberOfRestarts is ceil(MaxIterations / RestartLength), so
  // restart_length must be non-zero.  IterationCount starts at 0 so it is
  // never read uninitialised before the first solve.
  MixedPrecisionFlexibleGeneralisedMinimalResidual(RealD   tol,
                                                   Integer maxit,
                                                   GridBase * sp_grid,
                                                   LinearFunction<FieldF> &Prec,
                                                   Integer restart_length,
                                                   bool    err_on_no_conv = true)
      : ErrorOnNoConverge(err_on_no_conv)
      , Tolerance(tol)
      , MaxIterations(maxit)
      , RestartLength(restart_length)
      , MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
      , IterationCount(0)
      , H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
      , y(RestartLength + 1, 0.)
      , gamma(RestartLength + 1, 0.)
      , c(RestartLength + 1, 0.)
      , s(RestartLength + 1, 0.)
      , SinglePrecGrid(sp_grid)
      , Preconditioner(Prec) {};
  // Solve LinOp.Op(psi) = src, using the incoming psi as the initial guess.
  // Runs up to MaxNumberOfRestarts restart cycles; stops as soon as a cycle
  // reports a squared residual <= Tolerance^2 |src|^2 and then prints the
  // true residual and the timers.
  void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
    RealD cp;
    RealD ssq = norm2(src);
    RealD rsq = Tolerance * Tolerance * ssq;
    FieldD r(src._grid);
    std::cout << std::setprecision(4) << std::scientific;
    std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
    std::cout << GridLogIterative << "MPFGMRES:   src " << ssq   << std::endl;
    PrecTimer.Reset();
    MatrixTimer.Reset();
    LinalgTimer.Reset();
    QrTimer.Reset();
    CompSolutionTimer.Reset();
    ChangePrecTimer.Reset();
    GridStopWatch SolverTimer;
    SolverTimer.Start();
    IterationCount = 0;
    for (int k=0; k<MaxNumberOfRestarts; k++) {
      cp = outerLoopBody(LinOp, src, psi, rsq);
      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();
        // r = Op(psi) - src; only its norm is used below.
        LinOp.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD srcnorm       = sqrt(ssq);
        RealD resnorm       = sqrt(norm2(r));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage        << "MPFGMRES: Converged on iteration " << IterationCount
                  << " computed residual " << sqrt(cp / ssq)
                  << " true residual "     << true_residual
                  << " target "            << Tolerance << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Total      " <<       SolverTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Precon     " <<         PrecTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Matrix     " <<       MatrixTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: Linalg     " <<       LinalgTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: QR         " <<           QrTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: CompSol    " << CompSolutionTimer.Elapsed() << std::endl;
        std::cout << GridLogMessage << "MPFGMRES Time elapsed: PrecChange " <<   ChangePrecTimer.Elapsed() << std::endl;
        return;
      }
    }
    std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
    if (ErrorOnNoConverge)
      assert(0);
  }
  // One restart cycle: form the residual of the current psi, run up to
  // RestartLength Arnoldi/QR steps, update psi, and return the squared
  // residual estimate |gamma[i+1]|^2 of the last step taken.
  RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
    RealD cp = 0;
    FieldD w(src._grid);
    FieldD r(src._grid);
    // these should probably be made class members so that they are only allocated once, not in every restart
    std::vector<FieldD> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
    std::vector<FieldD> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
    MatrixTimer.Start();
    LinOp.Op(psi, w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    r = src - w;
    gamma[0] = sqrt(norm2(r));
    v[0] = (1. / gamma[0]) * r;
    LinalgTimer.Stop();
    for (int i=0; i<RestartLength; i++) {
      IterationCount++;
      arnoldiStep(LinOp, v, z, w, i);
      qrUpdate(i);
      cp = std::norm(gamma[i+1]);
      std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
                << " residual " << cp << " target " << rsq << std::endl;
      // Leave the cycle at the end of the restart window, when the global
      // iteration budget is spent, or on convergence.
      if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
        computeSolution(z, psi, i);
        return cp;
      }
    }
    assert(0); // Never reached
    return cp;
  }
  // Arnoldi step `iter`: z[iter] = Preconditioner(v[iter]) evaluated via
  // FieldF, w = Op(z[iter]), then w is orthogonalised against v[0..iter]
  // (coefficients stored in column `iter` of H) and normalised into v[iter+1].
  void arnoldiStep(LinearOperatorBase<FieldD> &LinOp, std::vector<FieldD> &v, std::vector<FieldD> &z, FieldD &w, int iter) {
    FieldF v_f(SinglePrecGrid);
    FieldF z_f(SinglePrecGrid);
    ChangePrecTimer.Start();
    precisionChange(v_f, v[iter]);
    precisionChange(z_f, z[iter]);
    ChangePrecTimer.Stop();
    PrecTimer.Start();
    Preconditioner(v_f, z_f);
    PrecTimer.Stop();
    ChangePrecTimer.Start();
    precisionChange(z[iter], z_f);
    ChangePrecTimer.Stop();
    MatrixTimer.Start();
    LinOp.Op(z[iter], w);
    MatrixTimer.Stop();
    LinalgTimer.Start();
    for (int i = 0; i <= iter; ++i) {
      H(iter, i) = innerProduct(v[i], w);
      w = w - H(iter, i) * v[i];
    }
    H(iter, iter + 1) = sqrt(norm2(w));
    v[iter + 1] = (1. / H(iter, iter + 1)) * w;
    LinalgTimer.Stop();
  }
  // Apply the previously stored Givens rotations to column `iter` of H, then
  // build and apply a new rotation that zeroes H(iter, iter+1), updating the
  // right-hand side gamma accordingly.
  void qrUpdate(int iter) {
    QrTimer.Start();
    for (int i = 0; i < iter ; ++i) {
      auto tmp       = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
      H(iter, i)     = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
      H(iter, i + 1) = tmp;
    }
    // Compute new Givens Rotation
    ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
    c[iter]     = H(iter, iter) / nu;
    s[iter]     = H(iter, iter + 1) / nu;
    // Apply new Givens rotation
    H(iter, iter)     = nu;
    H(iter, iter + 1) = 0.;
    gamma[iter + 1] = -s[iter] * gamma[iter];
    gamma[iter]     = std::conj(c[iter]) * gamma[iter];
    QrTimer.Stop();
  }
  // Back-substitute the upper-triangular system held in H for y[0..iter] and
  // add the correction sum_i y[i] z[i] to psi.
  void computeSolution(std::vector<FieldD> const &z, FieldD &psi, int iter) {
    CompSolutionTimer.Start();
    for (int i = iter; i >= 0; i--) {
      y[i] = gamma[i];
      for (int k = i + 1; k <= iter; k++)
        y[i] = y[i] - H(k, i) * y[k];
      y[i] = y[i] / H(i, i);
    }
    for (int i = 0; i <= iter; i++)
      psi = psi + z[i] * y[i];
    CompSolutionTimer.Stop();
  }
 };
 }
 #endif
--- a/Grid/algorithms/iterative/PowerMethod.h
+++ b/Grid/algorithms/iterative/PowerMethod.h
@@ -1,45 +0,0 @@
 #pragma once
 namespace Grid {
 // Power-iteration estimate of the largest eigenvalue of a Hermitian operator.
 template<class Field> class PowerMethod  
 { 
 public: 
  // Scale v to unit norm in place and return its norm before scaling.
  template<typename T>  static RealD normalise(T& v) 
  {
    RealD nn = norm2(v);
    nn = sqrt(nn);
    v = v * (1.0/nn);
    return nn;
  }
  // Starting from src, repeatedly normalise the vector, apply HermOp, and form
  // the Rayleigh quotient.  Returns the quotient as soon as it differs from
  // the previous estimate by less than 1% (relative), or after 50 iterations
  // at the latest.  Each intermediate estimate is logged.
  RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src) 
  { 
    // quickly get an idea of the largest eigenvalue to more properly normalize the residuum 
    RealD evalMaxApprox = 0.0; 
    auto src_n = src; 
    auto tmp = src; 
    const int _MAX_ITER_EST_ = 50; 
    for (int i=0;i<_MAX_ITER_EST_;i++) { 
      normalise(src_n); 
      HermOp.HermOp(src_n,tmp); 
      RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. 
      RealD vden = norm2(src_n); 
      RealD na = vnum/vden; 
      if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) { 
 	evalMaxApprox = na; 
 	return evalMaxApprox; 
      } 
      evalMaxApprox = na; 
      std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
      src_n = tmp;
    }
    // Not reachable: the last loop iteration always returns.
    assert(0);
    return 0;
  }
 };
 }
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -1,486 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_SCHUR_RED_BLACK_H
 #define GRID_SCHUR_RED_BLACK_H
  /*
   * Red black Schur decomposition
   *
   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
   *                =         L                     D                     U
   *
   * L^-1 = (1              0 )
   *        (-MoeMee^{-1}   1 )   
   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
   *           ( 0       1                    )
   *
   * U^-1 = (1   -Mee^{-1} Meo)
   *        (0    1           )
   * U^{dag} = ( 1                 0)
   *           (Meo^dag Mee^{-dag} 1)
   * U^{-dag} = (  1                 0)
   *            (-Meo^dag Mee^{-dag} 1)
   ***********************
   *     M psi = eta
   ***********************
   *Odd
   * i)                 D_oo psi_o =  L^{-1}  eta_o
   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *
   * Wilson:
   *      (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
   * Stag:
   *      D_oo psi_o = L^{-1}  eta =    (eta_o - Moe Mee^{-1} eta_e)
   *
   * L^-1 eta_o= (1              0 ) (eta_e)
   *             (-MoeMee^{-1}   1 ) (eta_o)
   *
   *Even
   * ii)  Mee psi_e + Meo psi_o = src_e
   *
   *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
   *
   * 
   * TODO: Other options:
   * 
   * a) change checkerboards for Schur e<->o
   *
   * Left precon by Moo^-1
   * b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 =  (D_oo)^dag M_oo^-dag Moo^-1 L^{-1}  eta_o
   *                              eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
   *
   * Right precon by Moo^-1
   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
   *                              psi_o = M_oo^-1 phi_o
   * TODO: Deflation 
   */
 namespace Grid {
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Use base class to share code
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take a matrix and form a Red Black solver calling a Herm solver
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Shared driver for red-black (even/odd) Schur-preconditioned solves.
  // The operator() overloads prepare the odd-checkerboard source, obtain an
  // initial guess, call the derived class' RedBlackSolve, and rebuild the full
  // solution with RedBlackSolution. Derived classes supply the four
  // RedBlack* hooks declared at the bottom.
  template<class Field> class SchurRedBlackBase {
  protected:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    OperatorFunction<Field> & _HermitianRBSolver; // solver invoked by the derived RedBlackSolve hooks
    int CBfactorise;
    bool subGuess;           // if true the initial guess is subtracted from the odd solution after the solve
    bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
  public:
    SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
        const bool _solnAsInitGuess = false)  :
    _HermitianRBSolver(HermitianRBSolver),
    useSolnAsInitGuess(_solnAsInitGuess)
    {
      CBfactorise = 0;
      subtractGuess(initSubGuess);
    }
    // The class is used polymorphically (pure virtual hooks), so destruction
    // through a base pointer must be well defined.
    virtual ~SchurRedBlackBase() = default;
    // Enable or disable subtraction of the initial guess after the solve.
    void subtractGuess(const bool initSubGuess)
    {
      subGuess = initSubGuess;
    }
    bool isSubtractGuess(void)
    {
      return subGuess;
    }
    /////////////////////////////////////////////////////////////
    // Shared code
    /////////////////////////////////////////////////////////////
    // Single right-hand side, zero initial guess.
    void operator() (Matrix & _Matrix,const Field &in, Field &out){
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Multiple right-hand sides, zero initial guess.
    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out)
    {
      ZeroGuesser<Field> guess;
      (*this)(_Matrix,in,out,guess);
    }
    // Multiple right-hand sides with a caller-supplied guesser.
    // out must hold at least in.size() fields; out[b] receives the solution for in[b].
    template<class Guesser>
    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess)
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      // out[b] is indexed for every b below; diagnose a short output vector up front.
      assert(out.size() >= in.size());
      int nblock = in.size();
      std::vector<Field> src_o(nblock,grid);
      std::vector<Field> sol_o(nblock,grid);
      std::vector<Field> guess_save;
      Field resid(fgrid);
      Field tmp(grid);
      ////////////////////////////////////////////////
      // Prepare RedBlack source
      ////////////////////////////////////////////////
      for(int b=0;b<nblock;b++){
        RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
      }
      ////////////////////////////////////////////////
      // Make the guesses
      ////////////////////////////////////////////////
      if ( subGuess ) guess_save.resize(nblock,grid);
      for(int b=0;b<nblock;b++){
        if(useSolnAsInitGuess) {
          pickCheckerboard(Odd, sol_o[b], out[b]);
        } else {
          guess(src_o[b],sol_o[b]);
        }
        if ( subGuess ) {
          guess_save[b] = sol_o[b];
        }
      }
      //////////////////////////////////////////////////////////////
      // Call the block solver
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
      RedBlackSolve(_Matrix,src_o,sol_o);
      ////////////////////////////////////////////////
      // A2A boolean behavioural control & reconstruct other checkerboard
      ////////////////////////////////////////////////
      for(int b=0;b<nblock;b++) {
        if (subGuess)   sol_o[b] = sol_o[b] - guess_save[b];
        ///////// Needs even source //////////////
        pickCheckerboard(Even,tmp,in[b]);
        RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
        /////////////////////////////////////////////////
        // Check unprec residual if possible
        /////////////////////////////////////////////////
        if ( ! subGuess ) {
          _Matrix.M(out[b],resid);
          resid = resid-in[b];
          RealD ns = norm2(in[b]);
          RealD nr = norm2(resid);
          std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
        } else {
          std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
        }
      }
    }
    // Single right-hand side with a caller-supplied guesser.
    template<class Guesser>
    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
      GridBase *grid = _Matrix.RedBlackGrid();
      GridBase *fgrid= _Matrix.Grid();
      Field resid(fgrid);
      Field src_o(grid);
      Field src_e(grid);
      Field sol_o(grid);
      ////////////////////////////////////////////////
      // RedBlack source
      ////////////////////////////////////////////////
      RedBlackSource(_Matrix,in,src_e,src_o);
      ////////////////////////////////
      // Construct the guess
      ////////////////////////////////
      if(useSolnAsInitGuess) {
        pickCheckerboard(Odd, sol_o, out);
      } else {
        guess(src_o,sol_o);
      }
      // The saved guess is only read when it is subtracted below, so skip the
      // field copy otherwise (matches the multi-RHS overload).
      Field  guess_save(grid);
      if (subGuess) guess_save = sol_o;
      //////////////////////////////////////////////////////////////
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      RedBlackSolve(_Matrix,src_o,sol_o);
      ////////////////////////////////////////////////
      // Fionn A2A boolean behavioural control
      ////////////////////////////////////////////////
      if (subGuess)      sol_o= sol_o-guess_save;
      ///////////////////////////////////////////////////
      // RedBlack solution needs the even source
      ///////////////////////////////////////////////////
      RedBlackSolution(_Matrix,sol_o,src_e,out);
      // Verify the unprec residual
      if ( ! subGuess ) {
        _Matrix.M(out,resid);
        resid = resid-in;
        RealD ns = norm2(in);
        RealD nr = norm2(resid);
        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
      } else {
        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
      }
    }
    /////////////////////////////////////////////////////////////
    // Override in derived.
    /////////////////////////////////////////////////////////////
    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)=0;
  };
  // Red-black Schur solve using SchurStaggeredOperator as the preconditioned operator.
  template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
  public:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
        const bool _solnAsInitGuess = false)
      :    SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess)
    {
    }
    //////////////////////////////////////////////////////
    // Override RedBlack specialisation
    //////////////////////////////////////////////////////
    // Split src into its even/odd parts and build the odd-checkerboard source.
    void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      Field   tmp(grid);
      Field  Mtmp(grid);
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd ,src_o,src);
      /////////////////////////////////////////////////////
      // src_o = (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);
      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
    }
    // Rebuild the full solution from the odd solution and the even source.
    void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol) override
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      Field   tmp(grid);
      Field   sol_e(grid);
      Field   src_e(grid);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
      // Form the difference straight from the const input; no separate copy of it is needed.
      src_e = src_e_c-tmp;             assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
    }
    // Single right-hand side: hand the Schur operator to the wrapped solver.
    void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
    {
      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
    }
    // Multiple right-hand sides: same operator, block call on the wrapped solver.
    void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
    {
      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
    }
  };
  // Shorter alias for the same class.
  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal has Mooee on it.
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Red-black Schur solve using SchurDiagMooeeOperator as the preconditioned operator.
  template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
  public:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
        const bool _solnAsInitGuess = false)
      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}
    //////////////////////////////////////////////////////
    // Override RedBlack specialisation
    //////////////////////////////////////////////////////
    // Split src into its even/odd parts and build the odd-checkerboard source.
    void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      Field   tmp(grid);
      Field  Mtmp(grid);
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd ,src_o,src);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);
      // get the right MpcDag
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);
    }
    // Rebuild the full solution from the odd solution and the even source.
    void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) override
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      Field   tmp(grid);
      Field  sol_e(grid);
      Field  src_e_i(grid);
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o,tmp);          assert(  tmp.checkerboard   ==Even);
      src_e_i = src_e-tmp;               assert(  src_e_i.checkerboard ==Even);
      _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(sol,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(sol,sol_o); assert(  sol_o.checkerboard ==Odd );
    }
    // Single right-hand side: hand the Schur operator to the wrapped solver.
    void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
    {
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
    }
    // Multiple right-hand sides: same operator, block call on the wrapped solver.
    void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
    {
      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
    }
  };
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Site diagonal is identity, right preconditioned by Mee^inv
  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi = ( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi = eta
  //=> psi = MeeInv phi
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // Red-black Schur solve using SchurDiagTwoOperator as the preconditioned operator.
  // The solver's odd result is passed through MooeeInv before the full solution is rebuilt.
  template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
  public:
    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
    /////////////////////////////////////////////////////
    // Wrap the usual normal equations Schur trick
    /////////////////////////////////////////////////////
  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
      const bool _solnAsInitGuess = false)
    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}
    // Split src into its even/odd parts and build the odd-checkerboard source.
    void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) override
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
      Field   tmp(grid);
      Field  Mtmp(grid);
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd ,src_o,src);
      /////////////////////////////////////////////////////
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
      /////////////////////////////////////////////////////
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);
      // get the right MpcDag
      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.checkerboard ==Odd);
    }
    // Rebuild the full solution from the solver's odd result and the even source.
    void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) override
    {
      GridBase *grid = _Matrix.RedBlackGrid();
      Field   sol_o_i(grid);
      Field   tmp(grid);
      Field   sol_e(grid);
      ////////////////////////////////////////////////
      // MooeeInv due to precond
      ////////////////////////////////////////////////
      _Matrix.MooeeInv(sol_o,tmp);
      sol_o_i = tmp;
      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
      ///////////////////////////////////////////////////
      _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.checkerboard   ==Even);
      tmp = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.checkerboard ==Even);
      setCheckerboard(sol,sol_e);    assert(  sol_e.checkerboard ==Even);
      setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.checkerboard ==Odd );
    }
    // Single right-hand side: hand the Schur operator to the wrapped solver.
    void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o) override
    {
      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
    }
    // Multiple right-hand sides: same operator, block call on the wrapped solver.
    void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o) override
    {
      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
    }
  };
 }
 #endif
--- a/Grid/communicator/Communicator_base.cc
+++ b/Grid/communicator/Communicator_base.cc
@@ -1,76 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_none.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <limits.h>
 #include <sys/mman.h>
 namespace Grid {
 ///////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////
 // Definition of the static communication policy; it starts out as CommunicatorPolicyConcurrent.
 CartesianCommunicator::CommunicatorPolicy_t  
 CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 // Definition of the static communication-thread count; it starts out as -1.
 int CartesianCommunicator::nCommThreads = -1;
 /////////////////////////////////
 // Grid information queries
 /////////////////////////////////
 // Number of dimensions of the processor grid.
 int CartesianCommunicator::Dimensions(void)
 {
  return _ndimension;
 }
 // Non-zero when this rank is rank 0.
 int CartesianCommunicator::IsBoss(void)
 {
  return (_processor==0) ? 1 : 0;
 }
 // The boss is always rank 0.
 int CartesianCommunicator::BossRank(void)
 {
  return 0;
 }
 // Rank of this process.
 int CartesianCommunicator::ThisRank(void)
 {
  return _processor;
 }
 // Coordinate of this process in the processor grid.
 const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void)
 {
  return _processor_coor;
 }
 // Extent of the processor grid in each dimension.
 const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)
 {
  return _processors;
 }
 // Total number of processes in the grid.
 int CartesianCommunicator::ProcessorCount(void)
 {
  return _Nprocessors;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
 // Single-precision complex: summed as two consecutive floats.
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
  float *as_reals = reinterpret_cast<float *>(&c);
  GlobalSumVector(as_reals,2);
 }
 // N single-precision complex values: summed as 2*N floats.
 void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 {
  float *as_reals = reinterpret_cast<float *>(c);
  GlobalSumVector(as_reals,2*N);
 }
 // Double-precision complex: summed as two consecutive doubles.
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
  double *as_reals = reinterpret_cast<double *>(&c);
  GlobalSumVector(as_reals,2);
 }
 // N double-precision complex values: summed as 2*N doubles.
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
  double *as_reals = reinterpret_cast<double *>(c);
  GlobalSumVector(as_reals,2*N);
 }
 }
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -1,509 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/communicator/SharedMemory.h>
 namespace Grid {
 // Definition of the static world communicator; Init() fills it with a duplicate of MPI_COMM_WORLD.
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
 ////////////////////////////////////////////
 // First initialise of comms system
 ////////////////////////////////////////////
 void CartesianCommunicator::Init(int *argc, char ***argv)
 {
  // Another library may already have started MPI; only initialise it if not.
  int mpi_is_up;
  MPI_Initialized(&mpi_is_up);
  if ( mpi_is_up == 0 ) {
    int granted;
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&granted);
    // One comms thread needs any threading mode other than SINGLE;
    // several comms threads need MULTIPLE.
    const bool single_not_enough   = (nCommThreads == 1) && (granted == MPI_THREAD_SINGLE);
    const bool multiple_not_given  = (nCommThreads > 1)  && (granted != MPI_THREAD_MULTIPLE);
    if ( single_not_enough || multiple_not_given )
      assert(0);
  }
  // The world communicator is duplicated once and never cleaned up.
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  // Shared-memory initialisation and allocation run between quiesce/unquiesce.
  Grid_quiesce_nodes();
  GlobalSharedMemory::Init(communicator_world);
  GlobalSharedMemory::SharedMemoryAllocate(GlobalSharedMemory::MAX_MPI_SHM_BYTES,
                                           GlobalSharedMemory::Hugepages);
  Grid_unquiesce_nodes();
 }
 ///////////////////////////////////////////////////////////////////////////
 // Use cartesian communicators now even in MPI3
 ///////////////////////////////////////////////////////////////////////////
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  // MPI_Cart_shift fills in the source and destination ranks for a shift along dim.
  const int rc = MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(rc==0);
 }
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  // Look up the rank that sits at the given processor-grid coordinate.
  int found_rank;
  const int rc = MPI_Cart_rank(communicator, coor.data(), &found_rank);
  assert(rc==0);
  return found_rank;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
  // Size the output to the grid dimension, then have MPI fill in the coordinate of rank.
  coor.resize(_ndimension);
  const int rc = MPI_Cart_coords(communicator, rank, _ndimension, coor.data());
  assert(rc==0);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Initialises from communicator_world
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  // The shared-memory optimising remap hands back a communicator that this
  // constructor owns and must free once set-up is complete.
  MPI_Comm remapped_comm;
  GlobalSharedMemory::OptimalCommunicator(processors,remapped_comm);
  // Build the Cartesian communicator on top of it, then pass it to SetCommunicator.
  InitFromMPICommunicator(processors,remapped_comm);
  SetCommunicator(remapped_comm);
  // The temporary communicator is no longer needed.
  MPI_Comm_free(&remapped_comm);
 }
 //////////////////////////////////
 // Try to subdivide communicator
 //////////////////////////////////
 // Build a communicator for the processor grid `processors` by subdividing `parent`.
 //   processors : processor grid of each child communicator; may have more
 //                dimensions than the parent (extra leading dimensions are padded).
 //   parent     : communicator being subdivided.
 //   srank      : output; index of the child this rank belongs to (0 when no split happens).
 // The parent's size must be an exact multiple of the child size (asserted).
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)    
 {
  _ndimension = processors.size();  assert(_ndimension>=1);
  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  // Parent coordinate and grid, padded at the front with coordinate 0 / extent 1.
  std::vector<int> parent_processor_coor(_ndimension,0);
  std::vector<int> parent_processors    (_ndimension,1);
  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  int Nparent = parent._processors ; 
  int Nparent;
  MPI_Comm_size(parent.communicator,&Nparent);
  // Number of ranks in one child communicator.
  int childsize=1;
  for(int d=0;d<processors.size();d++) {
    childsize *= processors[d];
  }
  // Number of child communicators; the division must be exact.
  int Nchild = Nparent/childsize;
  assert (childsize * Nchild == Nparent);
  std::vector<int> ccoor(_ndimension); // coor within subcommunicator
  std::vector<int> scoor(_ndimension); // coor of split within parent
  std::vector<int> ssize(_ndimension); // number of splits in each dimension
  for(int d=0;d<_ndimension;d++){
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
  }
  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
  int crank;  
  // Mpi uses the reverse Lexico convention to us; so reversed routines called
  Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
  Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);      // ssize is the number of split grids
  MPI_Comm comm_split;
  if ( Nchild > 1 ) { 
    // Disabled debug output; note that comm_split has not been assigned yet at this point.
    if(0){
      std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
      std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    ";
      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " ";
      std::cout<<std::endl;
      std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
      std::cout<<std::endl;
      //////////////////////////////////////////////////////////////////////////////////////////////////////
      // Declare victory
      //////////////////////////////////////////////////////////////////////////////////////////////////////
      std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
 		<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
      std::cout << " Split communicator " <<comm_split <<std::endl;
    }
    ////////////////////////////////////////////////////////////////
    // Split the communicator
    ////////////////////////////////////////////////////////////////
    // srank is the colour (which child) and crank the key (ordering inside the child).
    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
    assert(ierr==0);
  } else {
    // Only one child: no split, just duplicate the parent and report split index 0.
    srank = 0;
    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
    assert(ierr==0);
  }
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Set up from the new split communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  InitFromMPICommunicator(processors,comm_split);
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Take the right SHM buffers
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  SetCommunicator(comm_split);
  ///////////////////////////////////////////////
  // Free the temp communicator 
  ///////////////////////////////////////////////
  MPI_Comm_free(&comm_split);
  if(0){ 
    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
    for(int d=0;d<processors.size();d++){
      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
    }
  }
  // The coordinate set by InitFromMPICommunicator must agree with the one computed above.
  for(int d=0;d<processors.size();d++){
    assert(_processor_coor[d] == ccoor[d] );
  }
 }
 // Build the Cartesian communicator for the requested processor grid on top of
 // communicator_base, record this rank's id and grid coordinate, and create the
 // per-direction halo communicators (two duplicates per dimension).
 void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
 {
   _ndimension = processors.size();
   _processor_coor.resize(_ndimension);

   // Requested rank count is the product of the grid extents
   _processors  = processors;
   _Nprocessors = 1;
   for(int dim=0;dim<_ndimension;dim++){
     _Nprocessors = _Nprocessors * _processors[dim];
   }

   // All dimensions are periodic; the reorder argument of MPI_Cart_create is 0
   std::vector<int> wrap(_ndimension,1);
   MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&wrap[0],0,&communicator);
   MPI_Comm_rank(communicator,&_processor);
   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

   // Diagnostic output, permanently disabled by the leading 0
   if ( 0 && (communicator_base != communicator_world) ) {
     std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
     std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
     for(int d=0;d<_processors.size();d++){
       std::cout << _processor_coor[d]<<" ";
     }
     std::cout << std::endl;
   }

   int cart_size;
   MPI_Comm_size(communicator,&cart_size);

   // One duplicated communicator per halo direction
   const auto nhalo = 2*_ndimension;
   communicator_halo.resize(nhalo);
   for(int h=0;h<nhalo;h++){
     MPI_Comm_dup(communicator,&communicator_halo[h]);
   }

   // The Cartesian communicator must contain exactly the requested number of ranks
   assert(cart_size==_Nprocessors);
 }
 // Release the Cartesian communicator and every halo communicator, but only
 // while MPI has not yet been finalised and a communicator handle is set.
 CartesianCommunicator::~CartesianCommunicator()
 {
   int finalised;
   MPI_Finalized(&finalised);
   if (!communicator) return;
   if (finalised)     return;

   MPI_Comm_free(&communicator);
   for(auto &halo : communicator_halo){
     MPI_Comm_free(&halo);
   }
 }
 // In-place sum of one 32-bit unsigned integer across all ranks of `communicator`.
 void CartesianCommunicator::GlobalSum(uint32_t &u)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(status==0);
 }
 // In-place sum of one 64-bit unsigned integer across all ranks of `communicator`.
 void CartesianCommunicator::GlobalSum(uint64_t &u)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(status==0);
 }
 // In-place bitwise XOR of one 32-bit unsigned integer across all ranks of `communicator`.
 void CartesianCommunicator::GlobalXOR(uint32_t &u)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
   assert(status==0);
 }
 // In-place bitwise XOR of one 64-bit unsigned integer across all ranks of `communicator`.
 void CartesianCommunicator::GlobalXOR(uint64_t &u)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
   assert(status==0);
 }
 // In-place sum of one single-precision value across all ranks of `communicator`.
 void CartesianCommunicator::GlobalSum(float &f)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(status==0);
 }
 // In-place element-wise sum of N single-precision values across all ranks of `communicator`.
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
   assert(status==0);
 }
 // In-place sum of one double-precision value across all ranks of `communicator`.
 void CartesianCommunicator::GlobalSum(double &d)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
   assert(status==0);
 }
 // In-place element-wise sum of N double-precision values across all ranks of `communicator`.
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
   const int status = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
   assert(status==0);
 }
 // Basic Halo comms primitive.
 // Blocking exchange: send `bytes` bytes from xmit to rank `dest` while
 // receiving `bytes` bytes into recv from rank `from`, implemented as the
 // Begin call immediately followed by the Complete call.
 // (A crc32 checksum printout of both buffers used to live here as a debugging
 //  aid; it is not compiled.)
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int dest,
                                            void *recv,
                                            int from,
                                            int bytes)
 {
   std::vector<CommsRequest_t> pending;
   SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
   SendToRecvFromComplete(pending);
 }
 // Point-to-point transfer of `bytes` bytes from rank `sender` to rank
 // `receiver` over `communicator`. Every rank may call this; only the two
 // ranks involved do any work. The message tag is the sender's rank.
 void CartesianCommunicator::SendRecvPacket(void *xmit,
                                            void *recv,
                                            int sender,
                                            int receiver,
                                            int bytes)
 {
   assert(sender != receiver);
   const int tag = sender;

   const bool i_am_sender   = (_processor == sender);
   const bool i_am_receiver = (_processor == receiver);

   if ( i_am_sender ) {
     MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
   }
   if ( i_am_receiver ) {
     MPI_Status stat;
     MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
   }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  int myrank = _processor;
  int ierr;
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    MPI_Request xrq;
    MPI_Request rrq;
    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else { 
    // Give the CPU to MPI immediately; can use threads to overlap optionally
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
 		      recv,bytes,MPI_CHAR,from, from,
 		      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
 }
 // Blocking stencil exchange: Begin followed directly by Complete.
 // Returns the byte count reported by StencilSendToRecvFromBegin.
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                                      int dest,
                                                      void *recv,
                                                      int from,
                                                      int bytes,int dir)
 {
   std::vector<CommsRequest_t> pending;
   const double off_node_bytes = StencilSendToRecvFromBegin(pending,xmit,dest,recv,from,bytes,dir);
   StencilSendToRecvFromComplete(pending,dir);
   return off_node_bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,
 							 void *recv,
 							 int from,
 							 int bytes,int dir)
 {
  int ncomm  =communicator_halo.size(); 
  int commdir=dir%ncomm;
  MPI_Request xrq;
  MPI_Request rrq;
  int ierr;
  int gdest = ShmRanks[dest];
  int gfrom = ShmRanks[from];
  int gme   = ShmRanks[_processor];
  assert(dest != _processor);
  assert(from != _processor);
  assert(gme  == ShmRank);
  double off_node_bytes=0.0;
  if ( gfrom ==MPI_UNDEFINED) {
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }
  if ( gdest == MPI_UNDEFINED ) {
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
    this->StencilSendToRecvFromComplete(list,dir);
  }
  return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 // Barrier over the shared-memory communicator (ShmComm) only.
 void CartesianCommunicator::StencilBarrier(void)
 {
   MPI_Barrier(ShmComm);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
 }
 // Barrier over all ranks of `communicator`.
 void CartesianCommunicator::Barrier(void)
 {
   const int status = MPI_Barrier(communicator);
   assert(status==0);
 }
 // Broadcast `bytes` raw bytes at `data` from rank `root` to all ranks of `communicator`.
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
   const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
   assert(status==0);
 }
 // Rank of the calling process within communicator_world.
 int CartesianCommunicator::RankWorld(void)
 {
   int world_rank;
   MPI_Comm_rank(communicator_world,&world_rank);
   return world_rank;
 }
 // Broadcast `bytes` raw bytes at `data` from rank `root` to all ranks of communicator_world.
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
   const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
   assert(status==0);
 }
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  std::vector<int> row(_ndimension,1);
  assert(dim>=0 && dim<_ndimension);
  //  Split the communicator
  row[dim] = _processors[dim];
  int me;
  CartesianCommunicator Comm(row,*this,me);
  Comm.AllToAll(in,out,words,bytes);
 }
 // All-to-all over `communicator`: every rank sends `words` objects of `bytes`
 // bytes each to every other rank, reading from `in` and writing to `out`.
 //
 // MPI uses "int" counts, so a large transfer easily overflows if expressed in
 // bytes (e.g. 64*64*64*128*16 == 500 million elements, times 24*4 bytes is
 // ~50x10^9 >> 2x10^9; this also turns up on 32^3 x 64 Gparity). The payload is
 // therefore wrapped in a contiguous datatype of `bytes` bytes and `words` of
 // those are exchanged. Both values must still fit in a non-negative int.
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
   const int iwords = static_cast<int>(words);
   const int ibytes = static_cast<int>(bytes);
   // Reject anything that does not round-trip through a non-negative int.
   // Comparing the uint64_t directly against the int is not enough: a value
   // that truncates to a negative int converts back to the same uint64_t.
   assert(iwords >= 0 && static_cast<uint64_t>(iwords) == words);
   assert(ibytes >= 0 && static_cast<uint64_t>(ibytes) == bytes);

   MPI_Datatype object;
   MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
   MPI_Type_commit(&object);
   MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
   MPI_Type_free(&object);
 }
 }
--- a/Grid/communicator/SharedMemory.cc
+++ b/Grid/communicator/SharedMemory.cc
@@ -1,92 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 namespace Grid { 
 // static data
 // Definitions of the GlobalSharedMemory static members. Only the first two
 // carry an explicit initialiser.
 uint64_t            GlobalSharedMemory::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; // 1 GiB
 int                 GlobalSharedMemory::Hugepages = 0;
 // Set to 1 at the end of Init() / SharedMemoryAllocate() respectively
 int                 GlobalSharedMemory::_ShmSetup;
 int                 GlobalSharedMemory::_ShmAlloc;
 uint64_t            GlobalSharedMemory::_ShmAllocBytes;

 // One buffer pointer per rank of the shared-memory communicator
 std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;

 // Shared-memory communicator, plus this process's rank in it and its size
 Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm;
 int                 GlobalSharedMemory::WorldShmRank;
 int                 GlobalSharedMemory::WorldShmSize;
 // Indexed by world rank: rank within WorldShmComm, or MPI_UNDEFINED
 std::vector<int>    GlobalSharedMemory::WorldShmRanks;

 // World communicator handed to Init(), plus its size and this process's rank
 Grid_MPI_Comm       GlobalSharedMemory::WorldComm;
 int                 GlobalSharedMemory::WorldSize;
 int                 GlobalSharedMemory::WorldRank;

 // Number of shared-memory groups and the index of the group this process is in
 int                 GlobalSharedMemory::WorldNodes;
 int                 GlobalSharedMemory::WorldNode;
 // Unmap every buffer in WorldShmCommBufs and reset the allocation flags.
 // Requires that an allocation of non-zero size is currently recorded.
 void GlobalSharedMemory::SharedMemoryFree(void)
 {
   assert(_ShmAlloc);
   assert(_ShmAllocBytes>0);

   int r = 0;
   while ( r<WorldShmSize ) {
     munmap(WorldShmCommBufs[r],_ShmAllocBytes);
     r++;
   }

   _ShmAllocBytes = 0;
   _ShmAlloc      = 0;
 }
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
 // Bump allocator: hands out the next `bytes` bytes starting at heap_top and
 // advances the heap. No alignment rounding is applied. Reports and asserts
 // once the running total reaches heap_size.
 void *SharedMemory::ShmBufferMalloc(size_t bytes)
 {
   const size_t base = heap_top;
   heap_top   = base + bytes;
   heap_bytes = heap_bytes + bytes;

   const bool exhausted = !(heap_bytes < heap_size);
   if (exhausted) {
     std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
     std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
     std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
     assert(heap_bytes<heap_size);
   }
   return (void *)base;
 }
 // Release everything handed out by ShmBufferMalloc: zero the byte count and
 // rewind the heap to the start of this rank's own buffer.
 void SharedMemory::ShmBufferFreeAll(void)
 {
   heap_bytes = 0;
   heap_top   = (size_t)ShmBufferSelf();
 }
 // Buffer belonging to this process: the ShmCommBufs entry at index ShmRank.
 void *SharedMemory::ShmBufferSelf(void)
 {
   void *mine = ShmCommBufs[ShmRank];
   return mine;
 }
 }
--- a/Grid/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -1,167 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 // TODO
 // 1) move includes into SharedMemory.cc
 //
 // 2) split shared memory into a) optimal communicator creation from comm world
 // 
 //                             b) shared memory buffers container
 //                                -- static globally shared; init once
 //                                -- per instance set of buffers.
 //                                   
 #pragma once 
 #include <Grid/GridCore.h>
 #if defined (GRID_COMMS_MPI3) 
 #include <mpi.h>
 #endif 
 #include <semaphore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <limits.h>
 #include <sys/types.h>
 #include <sys/ipc.h>
 #include <sys/shm.h>
 #include <sys/mman.h>
 #include <zlib.h>
 #ifdef HAVE_NUMAIF_H
 #include <numaif.h>
 #endif
 namespace Grid {
 // Communicator and request handle types. With the MPI3 comms backend they
 // are the native MPI types; otherwise plain ints stand in for them.
 #if defined (GRID_COMMS_MPI3) 
  typedef MPI_Comm    Grid_MPI_Comm;
  typedef MPI_Request CommsRequest_t;
 #else 
  typedef int CommsRequest_t;
  typedef int Grid_MPI_Comm;
 #endif
 // Process-wide (all-static) description of the world communicator, its
 // shared-memory sub-communicator, and the shared-memory buffers allocated on it.
 class GlobalSharedMemory {
 private:
   // Upper bound on log2(ranks per node) passed to Log2Size by the
   // OptimalCommunicator* routines
   static const int     MAXLOG2RANKSPERNODE = 16;            
   // Init once lock on the buffer allocation
   static int      _ShmSetup;       // 1 once Init() has completed
   static int      _ShmAlloc;       // 1 once SharedMemoryAllocate() has completed
   static uint64_t _ShmAllocBytes;  // size in bytes of each allocated buffer
 public:
   // Read-only accessors for the state flags above
   static int      ShmSetup(void)      { return _ShmSetup; }
   static int      ShmAlloc(void)      { return _ShmAlloc; }
   static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
   static uint64_t      MAX_MPI_SHM_BYTES;
   static int           Hugepages;
   // One buffer pointer per rank of WorldShmComm
   static std::vector<void *> WorldShmCommBufs;
   // Communicator passed to Init(), with this process's rank and the total size
   static Grid_MPI_Comm WorldComm;
   static int           WorldRank;
   static int           WorldSize;
   // Shared-memory split of WorldComm, with this process's rank and the group size
   static Grid_MPI_Comm WorldShmComm;
   static int           WorldShmRank;
   static int           WorldShmSize;
   // Number of shared-memory groups, and the index of the one this process is in
   static int           WorldNodes;
   static int           WorldNode;
   // Indexed by world rank: rank inside WorldShmComm, or MPI_UNDEFINED
   static std::vector<int>  WorldShmRanks;
   //////////////////////////////////////////////////////////////////////////////////////
   // Create an optimal reordered communicator that makes MPI_Cart_create get it right
   //////////////////////////////////////////////////////////////////////////////////////
   static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
   // Dispatches to the Hypercube variant when the hostname matches "r<R>i<I>n<N>", otherwise to the SharedMemory variant
   static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);
   // Rank ordering derived from the node's hypercube coordinate (parsed from the hostname)
   static void OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);
   // Rank ordering derived from WorldNode and WorldShmRank only
   static void OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm);
   ///////////////////////////////////////////////////
   // Provide shared memory facilities off comm world
   ///////////////////////////////////////////////////
   // Allocate one buffer of `bytes` bytes per rank of WorldShmComm; requires Init() to have run
   static void SharedMemoryAllocate(uint64_t bytes, int flags);
   // Unmap the buffers and clear the allocation flags
   static void SharedMemoryFree(void);
 };
 //////////////////////////////
 // one per communicator
 //////////////////////////////
 // Per-communicator view of the shared-memory buffers, with a simple bump
 // allocator (heap_top / heap_bytes / heap_size) over this rank's own buffer.
 class SharedMemory 
 {
 private:
   static const int     MAXLOG2RANKSPERNODE = 16;            
   size_t heap_top;   // address of the next free byte, stored as an integer
   size_t heap_bytes; // bytes handed out since the last ShmBufferFreeAll
   size_t heap_size;  // capacity checked by ShmBufferMalloc
 protected:
   Grid_MPI_Comm    ShmComm; // for barriers
   int    ShmRank; 
   int    ShmSize;
   std::vector<void *> ShmCommBufs;
   std::vector<int>    ShmRanks;// Mapping comm ranks to Shm ranks
 public:
   // NOTE(review): the default constructor leaves the heap_* members and the
   // Shm* scalars uninitialised; presumably SetCommunicator fills them in - confirm.
   SharedMemory() {};
   ~SharedMemory();
   ///////////////////////////////////////////////////////////////////////////////////////
   // set the buffers & sizes
   ///////////////////////////////////////////////////////////////////////////////////////
   void SetCommunicator(Grid_MPI_Comm comm);
   ////////////////////////////////////////////////////////////////////////
   // For this instance ; disjoint buffer sets between splits if split grid
   ////////////////////////////////////////////////////////////////////////
   void ShmBarrier(void); 
   ///////////////////////////////////////////////////
   // Call on any instance
   ///////////////////////////////////////////////////
   void SharedMemoryTest(void);
   void *ShmBufferSelf(void);                         // this rank's own buffer, ShmCommBufs[ShmRank]
   void *ShmBuffer    (int rank);
   void *ShmBufferTranslate(int rank,void * local_p);
   void *ShmBufferMalloc(size_t bytes);               // bump-allocate from this rank's buffer
   void  ShmBufferFreeAll(void) ;                     // rewind the bump allocator
   //////////////////////////////////////////////////////////////////////////
   // Make info on Nodes & ranks and Shared memory available
   //////////////////////////////////////////////////////////////////////////
   int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
   int RankCount(void) { return GlobalSharedMemory::WorldSize;};
 };
 }
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -1,667 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemoryMPI.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <pwd.h>
 namespace Grid { 
 /*Construct from an MPI communicator*/
 // One-time setup of the static world / shared-memory description:
 //   WorldComm, WorldRank, WorldSize           - the communicator passed in
 //   WorldShmComm, WorldShmRank, WorldShmSize  - its MPI_COMM_TYPE_SHARED split
 //   WorldShmRanks                             - per world rank, the rank in WorldShmComm or MPI_UNDEFINED
 //   WorldNodes, WorldNode                     - number of shared-memory groups and the index of ours
 // Must be called exactly once (asserts _ShmSetup==0) and sets _ShmSetup=1 on success.
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
   assert(_ShmSetup==0);
   WorldComm = comm;
   MPI_Comm_rank(WorldComm,&WorldRank);
   MPI_Comm_size(WorldComm,&WorldSize);
   // WorldComm, WorldSize, WorldRank

   /////////////////////////////////////////////////////////////////////
   // Split into groups that can share memory
   /////////////////////////////////////////////////////////////////////
   MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
   MPI_Comm_rank(WorldShmComm     ,&WorldShmRank);
   MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
   // WorldShmComm, WorldShmSize, WorldShmRank

   // WorldNodes
   WorldNodes = WorldSize/WorldShmSize;
   assert( (WorldNodes * WorldShmSize) == WorldSize );

   // FIXME: Check all WorldShmSize are the same ?

   /////////////////////////////////////////////////////////////////////
   // find world ranks in our SHM group (i.e. which ranks are on our node)
   /////////////////////////////////////////////////////////////////////
   MPI_Group WorldGroup, ShmGroup;
   MPI_Comm_group (WorldComm, &WorldGroup);
   MPI_Comm_group (WorldShmComm, &ShmGroup);

   std::vector<int> world_ranks(WorldSize);   for(int r=0;r<WorldSize;r++) world_ranks[r]=r;

   WorldShmRanks.resize(WorldSize);
   MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]);

   // The group handles were only needed for the translation; release them so
   // they are not leaked.
   MPI_Group_free(&WorldGroup);
   MPI_Group_free(&ShmGroup);

   ///////////////////////////////////////////////////////////////////
   // Identify who is in my group and nominate the leader
   ///////////////////////////////////////////////////////////////////
   int g=0;
   std::vector<int> MyGroup;
   MyGroup.resize(WorldShmSize);
   for(int rank=0;rank<WorldSize;rank++){
     if(WorldShmRanks[rank]!=MPI_UNDEFINED){
       assert(g<WorldShmSize);
       MyGroup[g++] = rank;
     }
   }

   // The leader of a group is its lowest world rank
   std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
   int myleader = MyGroup[0];

   std::vector<int> leaders_1hot(WorldSize,0);
   std::vector<int> leaders_group(WorldNodes,0);
   leaders_1hot [ myleader ] = 1;

   ///////////////////////////////////////////////////////////////////
   // global sum leaders over comm world
   ///////////////////////////////////////////////////////////////////
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
   assert(ierr==0);

   ///////////////////////////////////////////////////////////////////
   // find the group leaders world rank
   ///////////////////////////////////////////////////////////////////
   int group=0;
   for(int l=0;l<WorldSize;l++){
     if(leaders_1hot[l]){
       // More leaders than WorldNodes means the shared-memory groups are not
       // all the same size; stop here rather than write past leaders_group.
       assert(group<WorldNodes);
       leaders_group[group++] = l;
     }
   }

   ///////////////////////////////////////////////////////////////////
   // Identify the node of the group in which I (and my leader) live
   ///////////////////////////////////////////////////////////////////
   WorldNode=-1;
   for(int node=0;node<WorldNodes;node++){
     if (myleader == leaders_group[node]){
       WorldNode=node;
     }
   }
   assert(WorldNode!=-1);
   _ShmSetup=1;
 }
 // Gray encode support: each input is XORed with itself shifted right by one
 // bit, so consecutive inputs map to codes that differ in a single bit.
 int BinaryToGray (int  binary) {
   return binary ^ (binary>>1);
 }
 // Return i such that 2^i == TwoToPower, searching i = 0..MAXLOG2 inclusive.
 // Returns -1 when TwoToPower is not a power of two in that range (this
 // includes zero and negative values).
 int Log2Size(int TwoToPower,int MAXLOG2)
 {
   // Largest shift for which (1<<i) is still a positive int; shifting further
   // would be undefined behaviour, so the scan never goes beyond it.
   const int max_shift = static_cast<int>(sizeof(int)*8) - 2;
   const int last      = (MAXLOG2 < max_shift) ? MAXLOG2 : max_shift;

   for(int i=0;i<=last;i++){
     if ( (0x1<<i) == TwoToPower ) {
       return i;
     }
   }
   return -1;
 }
 // Choose the rank-reordering strategy for the Cartesian layout `processors`
 // and return the reordered communicator in optimal_comm.
 void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
 {
   //////////////////////////////////////////////////////////////////////////////
   // Look and see if it looks like an HPE 8600 based on hostname conventions
   //////////////////////////////////////////////////////////////////////////////
   const int namelen = _POSIX_HOST_NAME_MAX;
   // One extra byte: gethostname need not NUL-terminate a truncated name, and
   // leaves the buffer untouched on failure. Pre-terminating both ends makes
   // the sscanf below always read a valid C string.
   char name[namelen+1];
   name[0]       = '\0';
   name[namelen] = '\0';
   int R;
   int I;
   int N;
   gethostname(name,namelen);

   // Hostnames of the form r<R>i<I>n<N> select the hypercube mapping
   int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
   if(nscan==3) OptimalCommunicatorHypercube(processors,optimal_comm);
   else         OptimalCommunicatorSharedMemory(processors,optimal_comm);
 }
 // Build optimal_comm by re-ranking WorldComm so that the lexicographic rank
 // of each process follows from (a) a node coordinate derived from the
 // hypercube address encoded in the hostname "r<R>i<I>n<N>" and (b) the
 // process's coordinate inside its shared-memory group.
 // Requires WorldShmSize to be a power of two and the product of `processors`
 // to equal WorldSize (both asserted).
 void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
 {
   ////////////////////////////////////////////////////////////////
   // Assert power of two shm_size.
   ////////////////////////////////////////////////////////////////
   int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
   assert(log2size != -1);
   ////////////////////////////////////////////////////////////////
   // Identify the hypercube coordinate of this node using hostname
   ////////////////////////////////////////////////////////////////
   // n runs 0...7 9...16 18...25 27...34     (8*4)  5 bits
   // i runs 0..7                                    3 bits
   // r runs 0..3                                    2 bits
   // 2^10 = 1024 nodes
   const int maxhdim = 10; 
   // NOTE(review): HyperCubeCoords is written below but never read in this
   // function, and RootHyperCubeCoords is never used at all.
   std::vector<int> HyperCubeCoords(maxhdim,0);
   std::vector<int> RootHyperCubeCoords(maxhdim,0);
   int R;
   int I;
   int N;
   const int namelen = _POSIX_HOST_NAME_MAX;
   char name[namelen];
   // Parse ICE-XA hostname to get hypercube location
   // NOTE(review): the return value of gethostname is not checked and the
   // buffer has no spare byte for a terminator if the name were truncated.
   gethostname(name,namelen);
   int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
   assert(nscan==3);
   // Split N into a low part (N mod 9) and a high part (N div 9), then pack
   // R, I, nhi, nlo into one integer: nlo in bits 0-2, nhi from bit 3,
   // I from bit 5, R from bit 8.
   int nlo = N%9;
   int nhi = N/9;
   uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
   uint32_t rootcoor  = hypercoor;
   //////////////////////////////////////////////////////////////////
   // Print debug info
   //////////////////////////////////////////////////////////////////
   for(int d=0;d<maxhdim;d++){
     HyperCubeCoords[d] = (hypercoor>>d)&0x1;
   }
   std::string hname(name);
   std::cout << "hostname "<<hname<<std::endl;
   std::cout << "R " << R << " I " << I << " N "<< N
             << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
   //////////////////////////////////////////////////////////////////
   // broadcast node 0's base coordinate for this partition.
   //////////////////////////////////////////////////////////////////
   // After the broadcast every rank holds world rank 0's packed coordinate in
   // rootcoor; hypercoor becomes an offset relative to it.
   MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); 
   hypercoor=hypercoor-rootcoor;
   // NOTE(review): hypercoor is unsigned, so the >=0 assert is always true and
   // a coordinate below rootcoor wraps to a large value; only the <WorldSize
   // assert can catch that.
   assert(hypercoor<WorldSize);
   assert(hypercoor>=0);
   //////////////////////////////////////
   // Printing
   //////////////////////////////////////
   for(int d=0;d<maxhdim;d++){
     HyperCubeCoords[d] = (hypercoor>>d)&0x1;
   }
   ////////////////////////////////////////////////////////////////
   // Identify subblock of ranks on node spreading across dims
   // in a maximally symmetrical way
   ////////////////////////////////////////////////////////////////
   int ndimension              = processors.size();
   std::vector<int> processor_coor(ndimension);
   std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
   std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
   std::vector<int> HyperCoor(ndimension);
   // Distribute the log2size factors of two over the dimensions round-robin,
   // skipping any dimension whose WorldDims/ShmDims ratio is already <= 1.
   // NOTE(review): the inner while loop has no exit if every dimension is
   // exhausted; the WorldSize==Nprocessors check comes only later - confirm
   // that callers guarantee a consistent layout.
   int dim = 0;
   for(int l2=0;l2<log2size;l2++){
     while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
     ShmDims[dim]*=2;
     dim=(dim+1)%ndimension;
   }
   ////////////////////////////////////////////////////////////////
   // Establish torus of processes and nodes with sub-blockings
   ////////////////////////////////////////////////////////////////
   for(int d=0;d<ndimension;d++){
     NodeDims[d] = WorldDims[d]/ShmDims[d];
   }
   ////////////////////////////////////////////////////////////////
   // Map Hcube according to physical lattice 
   // must partition. Loop over dims and find out who would join.
   ////////////////////////////////////////////////////////////////
   // Peel log2(NodeDims[d]) bits off the relative hypercube coordinate for
   // each dimension and Gray-encode them.
   // NOTE(review): Log2Size returns -1 when NodeDims[d] is not a power of two,
   // which would make the shifts below invalid; there is no assert here.
   int hcoor = hypercoor;
   for(int d=0;d<ndimension;d++){
      int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
      int msk  = (0x1<<bits)-1;
      HyperCoor[d]=hcoor & msk;  
      HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
      hcoor = hcoor >> bits;
   } 
   ////////////////////////////////////////////////////////////////
   // Check processor counts match
   ////////////////////////////////////////////////////////////////
   int Nprocessors=1;
   for(int i=0;i<ndimension;i++){
     Nprocessors*=processors[i];
   }
   assert(WorldSize==Nprocessors);
   ////////////////////////////////////////////////////////////////
   // Establish mapping between lexico physics coord and WorldRank
   ////////////////////////////////////////////////////////////////
   int rank;
   // The node coordinate computed from WorldNode is overwritten on the next
   // line by the hostname-derived HyperCoor.
   Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
   for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
   Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
   for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
   Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
   /////////////////////////////////////////////////////////////////
   // Build the new communicator
   /////////////////////////////////////////////////////////////////
   // Single colour (0) with `rank` as the ordering key: same members as
   // WorldComm, re-ranked.
   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
   assert(ierr==0);
 }
 void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
 {
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
  assert(log2size != -1);
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int ndimension              = processors.size();
  std::vector<int> processor_coor(ndimension);
  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
  int dim = 0;
  for(int l2=0;l2<log2size;l2++){
    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
    ShmDims[dim]*=2;
    dim=(dim+1)%ndimension;
  }
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<ndimension;d++){
    NodeDims[d] = WorldDims[d]/ShmDims[d];
  }
  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  int Nprocessors=1;
  for(int i=0;i<ndimension;i++){
    Nprocessors*=processors[i];
  }
  assert(WorldSize==Nprocessors);
  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  ////////////////////////////////////////////////////////////////
  int rank;
  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
  /////////////////////////////////////////////////////////////////
  // Build the new communicator
  /////////////////////////////////////////////////////////////////
  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
  assert(ierr==0);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // SHMGET
 ////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_MPI3_SHMGET
 // Allocate one System V shared-memory segment of `bytes` bytes per rank of
 // WorldShmComm and attach all of them in every process of the group.
 // Rank 0 of the group creates the segments and broadcasts their ids.
 // The `flags` argument is not consulted by this implementation; huge pages
 // are requested through the static Hugepages switch instead.
 // Requires Init() to have run and no allocation to be outstanding.
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
   assert(_ShmSetup==1);
   assert(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared windows for our group
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   MPI_Barrier(WorldShmComm);
   WorldShmCommBufs.resize(WorldShmSize);
   std::vector<int> shmids(WorldShmSize);
   if ( WorldShmRank == 0 ) {
     for(int r=0;r<WorldShmSize;r++){
       size_t size = bytes;
       key_t key   = IPC_PRIVATE;
       // Named differently from the (unused) `flags` parameter so that it no
       // longer shadows it.
       int shm_flags = IPC_CREAT | SHM_R | SHM_W;
 #ifdef SHM_HUGETLB
       if (Hugepages) shm_flags|=SHM_HUGETLB;
 #endif
       if ((shmids[r]= shmget(key,size, shm_flags)) ==-1) {
         int errsv = errno;
         printf("Errno %d\n",errsv);
         printf("key   %d\n",(int)key);
         printf("size  %zu\n",size);   // size_t needs %zu, not %ld
         printf("flags %d\n",shm_flags);
         perror("shmget");
         exit(1);
       }
     }
   }
   // Hand the segment ids created by group rank 0 to everyone in the group
   MPI_Barrier(WorldShmComm);
   MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
   MPI_Barrier(WorldShmComm);
   for(int r=0;r<WorldShmSize;r++){
     WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
     if (WorldShmCommBufs[r] == (uint64_t *)-1) {
       perror("Shared memory attach failure");
       shmctl(shmids[r], IPC_RMID, NULL);
       exit(2);
     }
   }
   MPI_Barrier(WorldShmComm);
   ///////////////////////////////////
   // Mark for clean up
   ///////////////////////////////////
   // Done only after the barrier above, i.e. once every process of the group
   // has attached
   for(int r=0;r<WorldShmSize;r++){
     shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
   }
   MPI_Barrier(WorldShmComm);
   _ShmAlloc=1;
   _ShmAllocBytes  = bytes;
 }
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_MPI3_SHMMMAP
 // File-backed mmap implementation.
 // Every rank opens (creating if needed) one file per WorldShmComm rank under
 // GRID_SHM_PATH and maps `bytes` bytes of it MAP_SHARED into WorldShmCommBufs[r].
 // A non-zero `flags` adds MAP_HUGETLB where the platform defines it.
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
   assert(_ShmSetup==1);
   assert(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared windows for our group
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   MPI_Barrier(WorldShmComm);
   WorldShmCommBufs.resize(WorldShmSize);
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Hugetlbfs and others map filesystems as mappable huge pages
   ////////////////////////////////////////////////////////////////////////////////////////////
   char shm_name [NAME_MAX];
   for(int r=0;r<WorldShmSize;r++){
     // Bounded formatting: a long GRID_SHM_PATH must not overrun shm_name.
     const int len = snprintf(shm_name,sizeof(shm_name),GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
     if ( len < 0 || (size_t)len >= sizeof(shm_name) ) {
       printf("shared memory file name under %s is too long\n",GRID_SHM_PATH);
       exit(1);
     }
     int fd=open(shm_name,O_RDWR|O_CREAT,0666);
     if ( fd == -1) {
       printf("open %s failed\n",shm_name);
       perror("open hugetlbfs");
       exit(1); // failure must not be reported as a successful exit status
     }
     int mmap_flag = MAP_SHARED ;
 #ifdef MAP_POPULATE
     mmap_flag|=MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
     if ( flags ) mmap_flag |= MAP_HUGETLB;
 #endif
     void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
     if ( ptr == (void *)MAP_FAILED ) {
       printf("mmap %s failed\n",shm_name);
       perror("failed mmap");      assert(0);
     }
     // The mapping is expected to be 64-byte aligned.
     assert(((uint64_t)ptr&0x3F)==0);
     close(fd); // the mapping stays valid after the descriptor is closed
     WorldShmCommBufs[r] =ptr;
   }
   _ShmAlloc=1;
   _ShmAllocBytes  = bytes;
 }
 #endif // MMAP
 #ifdef GRID_MPI3_SHM_NONE
 // Anonymous mmap implementation; only valid when WorldShmSize == 1 (asserted).
 // Maps `bytes` bytes MAP_SHARED|MAP_ANONYMOUS and stores the pointer in
 // WorldShmCommBufs.  A non-zero `flags` adds MAP_HUGETLB where available.
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
   assert(_ShmSetup==1);
   assert(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared windows for our group
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   MPI_Barrier(WorldShmComm);
   WorldShmCommBufs.resize(WorldShmSize);
   assert(WorldShmSize == 1);
   for(int r=0;r<WorldShmSize;r++){
     // Anonymous mappings have no backing file: the descriptor argument is -1
     // and there is nothing to close afterwards.
     int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
 #ifdef MAP_POPULATE
     mmap_flag|=MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
     if ( flags ) mmap_flag |= MAP_HUGETLB;
 #endif
     void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,-1, 0);
     if ( ptr == (void *)MAP_FAILED ) {
       // There is no file name in this variant, so none is printed.
       printf("anonymous mmap failed\n");
       perror("failed mmap");      assert(0);
     }
     // The mapping is expected to be 64-byte aligned.
     assert(((uint64_t)ptr&0x3F)==0);
     WorldShmCommBufs[r] =ptr;
   }
   _ShmAlloc=1;
   _ShmAllocBytes  = bytes;
 }
 #endif // GRID_MPI3_SHM_NONE
 #ifdef GRID_MPI3_SHMOPEN
 ////////////////////////////////////////////////////////////////////////////////////////////
 // POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
 // tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
 // the posix shm virtual file system
 ////////////////////////////////////////////////////////////////////////////////////////////
 // POSIX shm_open implementation.
 // Rank 0 of WorldShmComm (re)creates one named object per rank, sizes it to
 // `bytes` and maps it; after a barrier the remaining ranks open and map the
 // same names.  Object names embed the user name, WorldNode and the rank index.
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
   assert(_ShmSetup==1);
   assert(_ShmAlloc==0);
   MPI_Barrier(WorldShmComm);
   WorldShmCommBufs.resize(WorldShmSize);
   // getpwuid may return NULL (e.g. no passwd entry for this uid); check once
   // up front instead of dereferencing it blindly inside the loops.
   const struct passwd *pw = getpwuid (getuid());
   if ( pw == NULL ) {
     perror("failed getpwuid");
     exit(1);
   }
   const size_t size = bytes;
   char shm_name [NAME_MAX];
   if ( WorldShmRank == 0 ) {
     for(int r=0;r<WorldShmSize;r++){
       const int len = snprintf(shm_name,sizeof(shm_name),"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
       if ( len < 0 || (size_t)len >= sizeof(shm_name) ) {
         printf("shared memory object name is too long\n");
         exit(1);
       }
       shm_unlink(shm_name); // drop any stale object left by an earlier run
       int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
       if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
       // A failed resize would leave an object too small to map safely.
       if ( ftruncate(fd, size) != 0 ) { perror("failed ftruncate"); assert(0); }
       int mmap_flag = MAP_SHARED;
 #ifdef MAP_POPULATE
       mmap_flag |= MAP_POPULATE;
 #endif
 #ifdef MAP_HUGETLB
       if (flags) mmap_flag |= MAP_HUGETLB;
 #endif
       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
       if ( ptr == (void * )MAP_FAILED ) {
         perror("failed mmap");
         assert(0);
       }
       assert(((uint64_t)ptr&0x3F)==0);
       WorldShmCommBufs[r] =ptr;
       close(fd);
     }
   }
   // Objects exist from here on; the other ranks may now open them.
   MPI_Barrier(WorldShmComm);
   if ( WorldShmRank != 0 ) {
     for(int r=0;r<WorldShmSize;r++){
       const int len = snprintf(shm_name,sizeof(shm_name),"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
       if ( len < 0 || (size_t)len >= sizeof(shm_name) ) {
         printf("shared memory object name is too long\n");
         exit(1);
       }
       int fd=shm_open(shm_name,O_RDWR,0666);
       if ( fd<0 ) { perror("failed shm_open"); assert(0); }
       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
       assert(((uint64_t)ptr&0x3F)==0);
       WorldShmCommBufs[r] =ptr;
       close(fd);
     }
   }
   _ShmAlloc=1;
   _ShmAllocBytes = bytes;
 }
 #endif
  ////////////////////////////////////////////////////////
  // Global shared functionality finished
  // Now move to per communicator functionality
  ////////////////////////////////////////////////////////
 // Binds this object to `comm`: splits out the shared-memory sub-communicator,
 // selects the matching globally allocated buffers, and records for every rank
 // of `comm` its rank inside ShmComm (MPI_UNDEFINED when it is not a member).
 // Requires GlobalSharedMemory to have allocated its buffers already.
 void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 {
   int size;
   MPI_Comm_size(comm,&size);
   ShmRanks.resize(size);
   /////////////////////////////////////////////////////////////////////
   // Split into groups that can share memory
   /////////////////////////////////////////////////////////////////////
   MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
   MPI_Comm_rank(ShmComm     ,&ShmRank);
   MPI_Comm_size(ShmComm     ,&ShmSize);
   ShmCommBufs.resize(ShmSize);
   //////////////////////////////////////////////////////////////////////
   // Map ShmRank to WorldShmRank and use the right buffer
   //////////////////////////////////////////////////////////////////////
   assert (GlobalSharedMemory::ShmAlloc()==1);
   heap_size = GlobalSharedMemory::ShmAllocBytes();
   for(int r=0;r<ShmSize;r++){
     // Only rank r contributes a non-zero term, so the sum delivers rank r's
     // WorldShmRank to every member of ShmComm.
     uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
     MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
     ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
   }
   ShmBufferFreeAll();
   /////////////////////////////////////////////////////////////////////
   // find comm ranks in our SHM group (i.e. which ranks are on our node)
   /////////////////////////////////////////////////////////////////////
   MPI_Group FullGroup, ShmGroup;
   MPI_Comm_group (comm   , &FullGroup);
   MPI_Comm_group (ShmComm, &ShmGroup);
   std::vector<int> ranks(size);
   for(int r=0;r<size;r++) ranks[r]=r;
   MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
   // Group handles obtained from MPI_Comm_group must be released explicitly.
   MPI_Group_free(&FullGroup);
   MPI_Group_free(&ShmGroup);
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
 //////////////////////////////////////////////////////////////////
 // Barrier across the ranks of ShmComm.
 void SharedMemory::ShmBarrier(void)
 {
  MPI_Barrier  (ShmComm);
 }
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Test the shared memory is working
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Self-check of the shared buffers: ShmRank 0 stamps the first three 64-bit
 // words of every buffer, then every rank verifies that it sees the same stamp.
 void SharedMemory::SharedMemoryTest(void)
 {
   ShmBarrier();
   if ( ShmRank == 0 ) {
     for(int peer=0;peer<ShmSize;peer++){
       uint64_t * stamp = (uint64_t *) ShmCommBufs[peer];
       stamp[0] = GlobalSharedMemory::WorldNode;
       stamp[1] = peer;
       stamp[2] = 0x5A5A5A;
     }
   }
   // Writes must be complete before anybody reads them back.
   ShmBarrier();
   for(int peer=0;peer<ShmSize;peer++){
     uint64_t * stamp = (uint64_t *) ShmCommBufs[peer];
     assert(stamp[0]==GlobalSharedMemory::WorldNode);
     assert(stamp[1]==peer);
     assert(stamp[2]==0x5A5A5A);
   }
   ShmBarrier();
 }
 // Returns the shared buffer of communicator rank `rank`, or NULL when that
 // rank has no entry in the shared-memory group (ShmRanks holds MPI_UNDEFINED).
 void *SharedMemory::ShmBuffer(int rank)
 {
   const int peer = ShmRanks[rank];
   if (peer != MPI_UNDEFINED) {
     return ShmCommBufs[peer];
   }
   return NULL;
 }
 // Translates `local_p`, an address inside this rank's shared buffer, to the
 // address at the same byte offset inside the buffer of communicator rank `rank`.
 // Returns NULL when `rank` has no entry in the shared-memory group.
 void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
   int gpeer = ShmRanks[rank];
   assert(gpeer!=ShmRank); // never send to self
   if (gpeer == MPI_UNDEFINED){
     return NULL;
   } else {
     // Same offset from the base of the peer's buffer as local_p has from ours.
     uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
     uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
     return (void *) remote;
   }
 }
 // Releases ShmComm, unless MPI has already been finalised.
 SharedMemory::~SharedMemory()
 {
   int finalised;
   MPI_Finalized(&finalised);
   if ( finalised ) return;
   MPI_Comm_free(&ShmComm);
 };
 }
--- a/Grid/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@@ -1,128 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/SharedMemory.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/GridCore.h>
 namespace Grid { 
 /*Construct from an MPI communicator*/
 // Single-process set-up: `comm` is not used; the world and the shared-memory
 // group are both described as one rank on one node.
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
   assert(_ShmSetup==0);
   // World description
   WorldComm  = 0;
   WorldRank  = 0;
   WorldSize  = 1;
   WorldNodes = 1;
   WorldNode  = 0;
   // Shared-memory group description
   WorldShmComm = 0;
   WorldShmRank = 0;
   WorldShmSize = 1;
   WorldShmRanks.resize(WorldSize);
   WorldShmRanks[0] = 0;
   WorldShmCommBufs.resize(1);
   _ShmSetup=1;
 }
 // Single-process variant: `processors` is ignored and WorldComm is handed back unchanged.
 void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
 {
  optimal_comm = WorldComm;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended, use anonymous mmap
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
  void * ShmCommBuf ; 
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  int mmap_flag =0;
 #ifdef MAP_ANONYMOUS
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
 #endif
 #ifdef MAP_ANON
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
 #endif
 #ifdef MAP_HUGETLB
  if ( flags ) mmap_flag |= MAP_HUGETLB;
 #endif
  ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
  if (ShmCommBuf == (void *)MAP_FAILED) {
    perror("mmap failed ");
    exit(EXIT_FAILURE);  
  }
 #ifdef MADV_HUGEPAGE
  if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
 #endif
  bzero(ShmCommBuf,bytes);
  WorldShmCommBufs[0] = ShmCommBuf;
  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
 };
  ////////////////////////////////////////////////////////
  // Global shared functionality finished
  // Now move to per communicator functionality
  ////////////////////////////////////////////////////////
 // Single-process variant: `comm` is not used.  The object is set up as the
 // only member of its shared-memory group and adopts the one global buffer.
 void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 {
   assert(GlobalSharedMemory::ShmAlloc()==1);
   // One rank, one buffer.
   ShmRank = 0;
   ShmSize = 1;
   ShmRanks.resize(1);
   ShmRanks[0] = 0;
   ShmCommBufs.resize(1);
   ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
   heap_size      = GlobalSharedMemory::ShmAllocBytes();
   ShmBufferFreeAll();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
 //////////////////////////////////////////////////////////////////
 void SharedMemory::ShmBarrier(void)
 {
   // Single-process variant: nothing to do.
 }
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Test the shared memory is working
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 void SharedMemory::SharedMemoryTest(void)
 {
   // Single-process variant: no check is performed.
 }
 // Single-process variant: always returns NULL, whatever `rank` is.
 void *SharedMemory::ShmBuffer(int rank)
 { return NULL; }
 // Single-process variant: always returns NULL; both arguments are ignored.
 void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 { return NULL; }
 // Single-process variant: the destructor releases nothing.
 SharedMemory::~SharedMemory() {}
 }
--- a/Grid/parallelIO/BinaryIO.cc
+++ b/Grid/parallelIO/BinaryIO.cc
@@ -1,3 +0,0 @@
 #include <Grid/GridCore.h>
 // Definition of Grid::BinaryIO::latticeWriteMaxRetry, initialised to -1.
 // NOTE(review): presumably a negative value disables or unbounds write retries — confirm against the code that reads it.
 int Grid::BinaryIO::latticeWriteMaxRetry = -1;
--- a/Grid/qcd/action/fermion/DomainWallFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallFermion.h
@@ -1,142 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/DomainWallFermion.h
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H
 #define  GRID_QCD_DOMAIN_WALL_FERMION_H
 #include <Grid/qcd/action/fermion/FermionCore.h>
 namespace Grid {
  namespace QCD {
    // Domain wall fermion operator: a CayleyFermion5D whose coefficients are set
    // in the constructor from Approx::higham(eps, Ls) via SetCoefficientsTanh.
    // Also provides free-field propagators computed in momentum space.
    template<class Impl>
    class DomainWallFermion : public CayleyFermion5D<Impl>
    {
    public:
      INHERIT_IMPL_TYPES(Impl);
    public:
      // Free propagator with explicit boundary conditions and twist.
      //   in, out  : source and result fields (same grid)
      //   mass     : mass passed to the momentum-space propagator
      //   boundary : Nd boundary factors; the phase acos(real(boundary[nu])) is used
      //   twist    : Nd twists; taken by value because it is shifted locally
      //   fiveD    : true  -> FFT over all but dimension 0, 5d propagator
      //              false -> FFT over all dimensions, 4d propagator
      void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist, bool fiveD) {
        FermionField in_k(in._grid);
        FermionField prop_k(in._grid);
        FFT theFFT((GridCartesian *) in._grid);
        //phase for boundary condition
        ComplexField coor(in._grid);
        ComplexField ph(in._grid);  ph = zero;
        FermionField in_buf(in._grid); in_buf = zero;
        Scalar ci(0.0,1.0);
        assert(twist.size() == Nd);//check that twist is Nd
        assert(boundary.size() == Nd);//check that boundary conditions is Nd
        int shift = 0;
        if(fiveD) shift = 1;
        for(unsigned int nu = 0; nu < Nd; nu++)
        {
          // Shift coordinate lattice index by 1 to account for 5th dimension.
          LatticeCoordinate(coor, nu + shift);
          double boundary_phase = ::acos(real(boundary[nu]));
          ph = ph + boundary_phase*coor*((1./(in._grid->_fdimensions[nu+shift])));
          //momenta for propagator shifted by twist+boundary
          twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
        }
        // Remove the boundary phase from the source before transforming.
        in_buf = exp(ci*ph*(-1.0))*in;
        if(fiveD){//FFT only on temporal and spatial dimensions
          std::vector<int> mask(Nd+1,1); mask[0] = 0;
          theFFT.FFT_dim_mask(in_k,in_buf,mask,FFT::forward);
          this->MomentumSpacePropagatorHt_5d(prop_k,in_k,mass,twist);
          theFFT.FFT_dim_mask(out,prop_k,mask,FFT::backward);
        }
        else{
          // Transform the de-phased source here as well, so that the phase
          // restored on `out` below matches the one removed from the input.
          theFFT.FFT_all_dim(in_k,in_buf,FFT::forward);
          this->MomentumSpacePropagatorHt(prop_k,in_k,mass,twist);
          theFFT.FFT_all_dim(out,prop_k,FFT::backward);
        }
        //phase for boundary condition
        out = out * exp(ci*ph);
      };
      // As above with fiveD = true.
      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {
        bool fiveD = true; //5d propagator by default
        FreePropagator(in,out,mass,boundary,twist,fiveD);
      };
      // Zero twist and boundary factor 1 in every direction; caller chooses fiveD.
      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass, bool fiveD) {
        std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
        std::vector<Complex> boundary;
        for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
        FreePropagator(in,out,mass,boundary,twist,fiveD);
      };
      // Zero twist, boundary factor 1 in every direction, fiveD = true.
      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
        bool fiveD = true; //5d propagator by default
        std::vector<double> twist(Nd,0.0); //default: twist angle 0
        std::vector<Complex> boundary;
        for(int i=0;i<Nd;i++) boundary.push_back(1); //default: periodic boundary conditions
        FreePropagator(in,out,mass,boundary,twist,fiveD);
      };
      virtual void   Instantiatable(void) {};
      // Constructors
      DomainWallFermion(GaugeField &_Umu,
                        GridCartesian         &FiveDimGrid,
                        GridRedBlackCartesian &FiveDimRedBlackGrid,
                        GridCartesian         &FourDimGrid,
                        GridRedBlackCartesian &FourDimRedBlackGrid,
                        RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) :
      CayleyFermion5D<Impl>(_Umu,
                            FiveDimGrid,
                            FiveDimRedBlackGrid,
                            FourDimGrid,
                            FourDimRedBlackGrid,_mass,_M5,p)
      {
        RealD eps = 1.0;
        Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
        assert(zdata->n==this->Ls);
        std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
        // Call base setter
        this->SetCoefficientsTanh(zdata,1.0,0.0);
        Approx::zolotarev_free(zdata);
      }
    };
  }
 }
 #endif
--- a/Grid/qcd/action/fermion/FourierAcceleratedPV.h
+++ b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
@@ -1,237 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/FourierAcceleratedPV.h
    Copyright (C) 2015
 Author: Christoph Lehner (lifted with permission by Peter Boyle, brought back to Grid)
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 namespace Grid {
 namespace QCD {
  template<typename M>
    void get_real_const_bc(M& m, RealD& _b, RealD& _c) {
    ComplexD b,c;
    b=m.bs[0];
    c=m.cs[0];
    std::cout << GridLogMessage << "b=" << b << ", c=" << c << std::endl;
    for (size_t i=1;i<m.bs.size();i++) {
      assert(m.bs[i] == b);
      assert(m.cs[i] == c);
    }
    assert(b.imag() == 0.0);
    assert(c.imag() == 0.0);
    _b = b.real();
    _c = c.real();
  }
 // Fourier-accelerated inverse of the Pauli-Villars operator dwfPV.
 // rotatePV() transforms along dimension 0; pvInv() then solves groups of
 // twisted-mass systems (2*group_in_s slices at a time) and rotates back.
 // NOTE(review): grid5D and gridRB5D are created in the constructor and are
 // not released anywhere in this class — confirm who owns them.
 template<typename Vi, typename M, typename G>
 class FourierAcceleratedPV {
 public:
   ConjugateGradient<Vi> &cg;
   M& dwfPV;
   G& Umu;
   GridCartesian* grid5D;
   GridRedBlackCartesian* gridRB5D;
   int group_in_s;
   // _group_in_s: number of (s, Ls-s-1) slice pairs solved together; the
   // extent of dimension 0 must be a multiple of 2*_group_in_s.
   // The initialiser list follows the member declaration order.
   FourierAcceleratedPV(M& _dwfPV, G& _Umu, ConjugateGradient<Vi> &_cg, int _group_in_s = 2)
    : cg(_cg), dwfPV(_dwfPV), Umu(_Umu), group_in_s(_group_in_s)
   {
     assert( dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0);
     grid5D = QCD::SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu._grid);
     gridRB5D = QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu._grid);
   }
   // forward == false: multiply slice s by exp(-i*pi*s/Ls), then forward FFT in dimension 0.
   // forward == true : backward FFT in dimension 0, then multiply slice s by exp(+i*pi*s/Ls).
   void rotatePV(const Vi& _src, Vi& dst, bool forward) const {
     GridStopWatch gsw1, gsw2;
     typedef typename Vi::scalar_type Coeff_t;
     int Ls = dst._grid->_fdimensions[0];
     Vi _tmp(dst._grid);
     double phase = M_PI / (double)Ls;
     Coeff_t bzero(0.0,0.0);
     FFT theFFT((GridCartesian*)dst._grid);
     if (!forward) {
       gsw1.Start();
       for (int s=0;s<Ls;s++) {
         Coeff_t a(::cos(phase*s),-::sin(phase*s));
         axpby_ssp(_tmp,a,_src,bzero,_src,s,s);
       }
       gsw1.Stop();
       gsw2.Start();
       theFFT.FFT_dim(dst,_tmp,0,FFT::forward);
       gsw2.Stop();
     } else {
       gsw2.Start();
       theFFT.FFT_dim(_tmp,_src,0,FFT::backward);
       gsw2.Stop();
       gsw1.Start();
       for (int s=0;s<Ls;s++) {
         Coeff_t a(::cos(phase*s),::sin(phase*s));
         axpby_ssp(dst,a,_tmp,bzero,_tmp,s,s);
       }
       gsw1.Stop();
     }
     std::cout << GridLogMessage << "Timing rotatePV: " << gsw1.Elapsed() << ", " << gsw2.Elapsed() << std::endl;
   }
   // Applies the inverse Pauli-Villars operator to _src, writing _dst.
   void pvInv(const Vi& _src, Vi& _dst) const {
     std::cout << GridLogMessage << "Fourier-Accelerated Outer Pauli Villars"<<std::endl;
     typedef typename Vi::scalar_type Coeff_t;
     int Ls = _dst._grid->_fdimensions[0];
     GridStopWatch gswT;
     gswT.Start();
     RealD b,c;
     get_real_const_bc(dwfPV,b,c);
     RealD M5 = dwfPV.M5;
     // U(true) Rightinv TMinv U(false) = Minv
     Vi _src_diag(_dst._grid);
     Vi _src_diag_slice(dwfPV.GaugeGrid());
     Vi _dst_diag_slice(dwfPV.GaugeGrid());
     Vi _src_diag_slices(grid5D);
     Vi _dst_diag_slices(grid5D);
     Vi _dst_diag(_dst._grid);
     rotatePV(_src,_src_diag,false);
     // now do TM solves
     Gamma G5(Gamma::Algebra::Gamma5);
     GridStopWatch gswA, gswB;
     gswA.Start();
     typedef typename M::Impl_t Impl;
     //WilsonTMFermion<Impl> tm(x.Umu,*x.UGridF,*x.UrbGridF,0.0,0.0,solver_outer.parent.par.wparams_f);
     std::vector<RealD> vmass(grid5D->_fdimensions[0],0.0);
     std::vector<RealD> vmu(grid5D->_fdimensions[0],0.0);
     WilsonTMFermion5D<Impl> tm(Umu,*grid5D,*gridRB5D,
                                *(GridCartesian*)dwfPV.GaugeGrid(),
                                *(GridRedBlackCartesian*)dwfPV.GaugeRedBlackGrid(),
                                vmass,vmu);
     //SchurRedBlackDiagTwoSolve<Vi> sol(cg);
     SchurRedBlackDiagMooeeSolve<Vi> sol(cg); // same performance as DiagTwo
     gswA.Stop();
     gswB.Start();
     for (int sgroup=0;sgroup<Ls/2/group_in_s;sgroup++) {
       // Per-slice mass and twist for this group; slot 2*sidx holds slice s and
       // slot 2*sidx+1 its partner Ls-s-1, which takes the opposite-sign mu.
       for (int sidx=0;sidx<group_in_s;sidx++) {
         int s = sgroup*group_in_s + sidx;
         RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
         RealD cosp = ::cos(phase);
         RealD sinp = ::sin(phase);
         RealD denom = b*b + c*c + 2.0*b*c*cosp;
         RealD mass = -(b*b*M5 + c*(1.0 - cosp + c*M5) + b*(-1.0 + cosp + 2.0*c*cosp*M5))/denom;
         RealD mu = (b+c)*sinp/denom;
         vmass[2*sidx + 0] = mass;
         vmass[2*sidx + 1] = mass;
         vmu[2*sidx + 0] = mu;
         vmu[2*sidx + 1] = -mu;
       }
       tm.update(vmass,vmu);
       // Gather the source slices of this group.
       for (int sidx=0;sidx<group_in_s;sidx++) {
         int s = sgroup*group_in_s + sidx;
         int sprime = Ls-s-1;
         ExtractSlice(_src_diag_slice,_src_diag,s,0);
         InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 0,0);
         ExtractSlice(_src_diag_slice,_src_diag,sprime,0);
         InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 1,0);
       }
       GridStopWatch gsw;
       gsw.Start();
       _dst_diag_slices = zero; // zero guess
       sol(tm,_src_diag_slices,_dst_diag_slices);
       gsw.Stop();
       std::cout << GridLogMessage << "Solve[sgroup=" << sgroup << "] completed in " << gsw.Elapsed() << ", " << gswA.Elapsed() << std::endl;
       // Scatter the solutions back, undoing the (pA +/- pB*G5) factor.
       for (int sidx=0;sidx<group_in_s;sidx++) {
         int s = sgroup*group_in_s + sidx;
         int sprime = Ls-s-1;
         RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
         RealD cosp = ::cos(phase);
         RealD sinp = ::sin(phase);
         // now rotate with inverse of
         Coeff_t pA = b + c*cosp;
         Coeff_t pB = - Coeff_t(0.0,1.0)*c*sinp;
         Coeff_t pABden = pA*pA - pB*pB;
         // (pA + pB * G5) * (pA - pB*G5) = (pA^2 - pB^2)
         ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 0,0);
         _dst_diag_slice = (pA/pABden) * _dst_diag_slice - (pB/pABden) * (G5 * _dst_diag_slice);
         InsertSlice(_dst_diag_slice,_dst_diag,s,0);
         ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 1,0);
         _dst_diag_slice = (pA/pABden) * _dst_diag_slice + (pB/pABden) * (G5 * _dst_diag_slice);
         InsertSlice(_dst_diag_slice,_dst_diag,sprime,0);
       }
     }
     gswB.Stop();
     rotatePV(_dst_diag,_dst,true);
     gswT.Stop();
     std::cout << GridLogMessage << "PV completed in " << gswT.Elapsed() << " (Setup: " << gswA.Elapsed() << ", s-loop: " << gswB.Elapsed() << ")" << std::endl;
   }
 };
 }}
--- a/Grid/qcd/action/fermion/MADWF.h
+++ b/Grid/qcd/action/fermion/MADWF.h
@@ -1,193 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/MADWF.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 namespace Grid {
 namespace QCD {
 // convert(), differing field types: copies `from` into `to` via precisionChange.
 template <class Fieldi, class Fieldo,IfNotSame<Fieldi,Fieldo> X=0>
 inline void convert(const Fieldi &from,Fieldo &to) 
 {
  precisionChange(to,from);
 }
 // convert(), identical field types: plain assignment.
 template <class Fieldi, class Fieldo,IfSame<Fieldi,Fieldo> X=0>
 inline void convert(const Fieldi &from,Fieldo &to) 
 {
  to=from;
 }
 template<class Matrixo,class Matrixi,class PVinverter,class SchurSolver, class Guesser> 
 class MADWF 
 {
 private:
  typedef typename Matrixo::FermionField FermionFieldo;
  typedef typename Matrixi::FermionField FermionFieldi;
  PVinverter  & PauliVillarsSolvero;// For the outer field
  SchurSolver & SchurSolveri;       // For the inner approx field
  Guesser     & Guesseri;           // To deflate the inner approx solves
  Matrixo & Mato;                   // Action object for outer
  Matrixi & Mati;                   // Action object for inner
  RealD target_resid;
  int   maxiter;
 public:
  MADWF(Matrixo &_Mato,
 	Matrixi &_Mati, 
 	PVinverter &_PauliVillarsSolvero, 
 	SchurSolver &_SchurSolveri,
 	Guesser & _Guesseri,
 	RealD resid,
 	int _maxiter) :
  Mato(_Mato),Mati(_Mati),
    SchurSolveri(_SchurSolveri),
    PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri)
  {   
    target_resid=resid;
    maxiter     =_maxiter; 
  };
  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
  {
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    std::cout << GridLogMessage<< "  MADWF-like algorithm                           " << std::endl;
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    FermionFieldi    c0i(Mati.GaugeGrid()); // 4d 
    FermionFieldi    y0i(Mati.GaugeGrid()); // 4d
    FermionFieldo    c0 (Mato.GaugeGrid()); // 4d 
    FermionFieldo    y0 (Mato.GaugeGrid()); // 4d
    FermionFieldo    A(Mato.FermionGrid()); // Temporary outer
    FermionFieldo    B(Mato.FermionGrid()); // Temporary outer
    FermionFieldo    b(Mato.FermionGrid()); // 5d source
    FermionFieldo    c(Mato.FermionGrid()); // PVinv source; reused so store
    FermionFieldo    defect(Mato.FermionGrid()); // 5d source
    FermionFieldi   ci(Mati.FermionGrid()); 
    FermionFieldi   yi(Mati.FermionGrid()); 
    FermionFieldi   xi(Mati.FermionGrid()); 
    FermionFieldi srci(Mati.FermionGrid()); 
    FermionFieldi   Ai(Mati.FermionGrid()); 
    RealD m=Mati.Mass();
    ///////////////////////////////////////
    //Import source, include Dminus factors
    ///////////////////////////////////////
    Mato.ImportPhysicalFermionSource(src4,b); 
    std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;
    defect = b;
    sol5=zero;
    for (int i=0;i<maxiter;i++) {
      ///////////////////////////////////////
      // Set up c0 from current defect
      ///////////////////////////////////////
      PauliVillarsSolvero(Mato,defect,A);
      Mato.Pdag(A,c);
      ExtractSlice(c0, c, 0 , 0);
      ////////////////////////////////////////////////
      // Solve the inner system with surface term c0
      ////////////////////////////////////////////////
      ci = zero;  
      convert(c0,c0i); // Possible precison change
      InsertSlice(c0i,ci,0, 0);
      // Dwm P y = Dwm x = D(1) P (c0,0,0,0)^T
      Mati.P(ci,Ai);
      Mati.SetMass(1.0);      Mati.M(Ai,srci);      Mati.SetMass(m);
      SchurSolveri(Mati,srci,xi,Guesseri); 
      Mati.Pdag(xi,yi);
      ExtractSlice(y0i, yi, 0 , 0);
      convert(y0i,y0); // Possible precision change
      //////////////////////////////////////
      // Propagate solution back to outer system
      // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
      //////////////////////////////////////
      c0 = - y0;
      InsertSlice(c0, c, 0   , 0);
      /////////////////////////////
      // Reconstruct the bulk solution Pdag PV^-1 Dm P 
      /////////////////////////////
      Mato.P(c,B);
      Mato.M(B,A);
      PauliVillarsSolvero(Mato,A,B);
      Mato.Pdag(B,A);
      //////////////////////////////
      // Reinsert surface prop
      //////////////////////////////
      InsertSlice(y0,A,0,0);
      //////////////////////////////
      // Convert from y back to x 
      //////////////////////////////
      Mato.P(A,B);
      //         sol5' = sol5 + M^-1 defect
      //               = sol5 + M^-1 src - M^-1 M sol5  ...
      sol5 = sol5 + B;
      std::cout << GridLogMessage << "***************************************" <<std::endl;
      std::cout << GridLogMessage << " Sol5 update "<<std::endl;
      std::cout << GridLogMessage << "***************************************" <<std::endl;
      std::cout << GridLogMessage << " Sol5 now "<<norm2(sol5)<<std::endl;
      std::cout << GridLogMessage << " delta    "<<norm2(B)<<std::endl;
       // New defect  = b - M sol5
       Mato.M(sol5,A);
       defect = b - A;
       std::cout << GridLogMessage << " defect   "<<norm2(defect)<<std::endl;
       double resid = ::sqrt(norm2(defect) / norm2(b));
       std::cout << GridLogMessage << "Residual " << i << ": " << resid  << std::endl;
       std::cout << GridLogMessage << "***************************************" <<std::endl;
       if (resid < target_resid) {
 	 return;
       }
    }
    std::cout << GridLogMessage << "MADWF : Exceeded maxiter "<<std::endl;
    assert(0);
  }
 };
 }}
--- a/Grid/qcd/action/fermion/PauliVillarsInverters.h
+++ b/Grid/qcd/action/fermion/PauliVillarsInverters.h
@@ -1,95 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/PauliVillarsInverters.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 namespace Grid {
 namespace QCD {
 template<class Field>
 class PauliVillarsSolverUnprec
 {
 public:
  ConjugateGradient<Field> & CG;
  PauliVillarsSolverUnprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    RealD m = _Matrix.Mass();
    Field A  (_Matrix.FermionGrid());
    MdagMLinearOperator<Matrix,Field> HermOp(_Matrix);
    _Matrix.SetMass(1.0);
    _Matrix.Mdag(src,A);
    CG(HermOp,A,sol);
    _Matrix.SetMass(m);
  };
 };
 ////////////////////////////////////////////////////////////////////////
 // Solve with the operator evaluated at unit mass by delegating to a
 // caller-supplied Schur solver.
 //
 // operator() saves the current mass of _Matrix, sets it to 1.0, calls
 // SchurSolver(_Matrix, src, sol) and then restores the saved mass.
 // No temporary field is needed here, so none is allocated.
 ////////////////////////////////////////////////////////////////////////
 template<class Field,class SchurSolverType>
 class PauliVillarsSolverRBprec
 {
 public:
  SchurSolverType & SchurSolver;   // solver supplied (and owned) by the caller
  PauliVillarsSolverRBprec( SchurSolverType &_SchurSolver) : SchurSolver(_SchurSolver){};

  // src : right-hand side
  // sol : passed straight through to SchurSolver as its solution argument
  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    const auto savedMass = _Matrix.Mass();
    _Matrix.SetMass(1.0);
    SchurSolver(_Matrix,src,sol);
    // Leave the operator exactly as the caller configured it.
    _Matrix.SetMass(savedMass);
  };
 };
 template<class Field,class GaugeField>
 class PauliVillarsSolverFourierAccel
 {
 public:
  GaugeField      & Umu;
  ConjugateGradient<Field> & CG;
  PauliVillarsSolverFourierAccel(GaugeField &_Umu,ConjugateGradient<Field> &_CG) :  Umu(_Umu), CG(_CG)
  {
  };
  template<class Matrix>
  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
  {
    FourierAcceleratedPV<Field, Matrix, typename Matrix::GaugeField > faPV(_Matrix,Umu,CG) ;
    faPV.pvInv(src,sol);
  };
 };
 }
 }
--- a/Grid/qcd/action/fermion/Reconstruct5Dprop.h
+++ b/Grid/qcd/action/fermion/Reconstruct5Dprop.h
@@ -1,135 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/Reconstruct5Dprop.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once
 namespace Grid {
 namespace QCD {
 template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
 private:
  PVinverter & PauliVillarsSolver;
 public:
 /////////////////////////////////////////////////////
 // First cut works, 10 Oct 2018.
 //
 // Must form a plan to get this into production for Zmobius acceleration
 // of the Mobius exact AMA corrections.
 //
 // TODO : understand absence of contact term in eqns in Hantao's thesis
 //        sol4 is contact term subtracted, but thesis & Brower's paper suggests not.
 //
 // Step 1: Localise PV inverse in a routine. [DONE]
 // Step 2: Schur based PV inverse            [DONE]
 // Step 3: Fourier accelerated PV inverse    [DONE]
 //
 /////////////////////////////////////////////////////
  Reconstruct5DfromPhysical(PVinverter &_PauliVillarsSolver) 
    : PauliVillarsSolver(_PauliVillarsSolver) 
  { 
  };
   template<class Matrix>
   void PV(Matrix &_Matrix,const Field &src,Field &sol)
   {
     RealD m = _Matrix.Mass();
     _Matrix.SetMass(1.0);
     _Matrix.M(src,sol);
     _Matrix.SetMass(m);
   }
   template<class Matrix>
   void PVdag(Matrix &_Matrix,const Field &src,Field &sol)
   {
     RealD m = _Matrix.Mass();
     _Matrix.SetMass(1.0);
     _Matrix.Mdag(src,sol);
     _Matrix.SetMass(m);
   }
  template<class Matrix>
  void operator() (Matrix & _Matrix,const Field &sol4,const Field &src4, Field &sol5){
    int Ls =  _Matrix.Ls;
    Field psi4(_Matrix.GaugeGrid());
    Field psi(_Matrix.FermionGrid());
    Field A  (_Matrix.FermionGrid());
    Field B  (_Matrix.FermionGrid());
    Field c  (_Matrix.FermionGrid());
    typedef typename Matrix::Coeff_t Coeff_t;
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    std::cout << GridLogMessage<< " Reconstruct5Dprop: c.f. MADWF algorithm         " << std::endl;
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    ///////////////////////////////////////
    //Import source, include Dminus factors
    ///////////////////////////////////////
    _Matrix.ImportPhysicalFermionSource(src4,B); 
    ///////////////////////////////////////
    // Set up c from src4
    ///////////////////////////////////////
    PauliVillarsSolver(_Matrix,B,A);
    _Matrix.Pdag(A,c);
    //////////////////////////////////////
    // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
    //////////////////////////////////////
    psi4 = - sol4;
    InsertSlice(psi4, psi, 0   , 0);
    for (int s=1;s<Ls;s++) {
      ExtractSlice(psi4,c,s,0);
       InsertSlice(psi4,psi,s,0);
    }
    /////////////////////////////
    // Pdag PV^-1 Dm P 
    /////////////////////////////
    _Matrix.P(psi,B);
    _Matrix.M(B,A);
    PauliVillarsSolver(_Matrix,A,B);
    _Matrix.Pdag(B,A);
    //////////////////////////////
    // Reinsert surface prop
    //////////////////////////////
    InsertSlice(sol4,A,0,0);
    //////////////////////////////
    // Convert from y back to x 
    //////////////////////////////
    _Matrix.P(A,sol5);
  }
 };
 }
 }
--- a/Grid/qcd/action/fermion/StaggeredKernels.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernels.cc
@@ -1,294 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/StaggeredKernels.cc
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 namespace Grid {
 namespace QCD {
 // Definitions of the static kernel-selection switches, initialised to the
 // OptGeneric kernel variant and the CommsAndCompute setting.
 int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
 int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
 // One stencil leg, taking the neighbour from wherever it lives.
 // Looks up stencil entry Dir+skew for site sF; a local neighbour is read from
 // `in` (through the scratch `chi` when a permute is flagged), otherwise it is
 // read from `buf`. Then calls multLink(Uchi, U._odata[sU], *chi_p, Dir).
 // Expects st, SE, ptype, sF, sU, in, buf, chi, chi_p and Uchi in the expanding scope.
 #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
    if (SE->_permute) {						\
      chi_p = &chi;						\
      permute(chi,  in._odata[SE->_offset], ptype);		\
    } else {							\
      chi_p = &in._odata[SE->_offset];				\
    }								\
  } else {							\
    chi_p = &buf[SE->_offset];					\
  }								\
  multLink(Uchi, U._odata[sU], *chi_p, Dir);			
 // Interior variant: as above, but a non-local neighbour is only taken from
 // `buf` when st.same_node[Dir] is set; if the entry is neither local nor on
 // the same node the leg contributes nothing.
 #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
    if (SE->_permute) {						\
      chi_p = &chi;						\
      permute(chi,  in._odata[SE->_offset], ptype);		\
    } else {							\
      chi_p = &in._odata[SE->_offset];				\
    }								\
  } else if ( st.same_node[Dir] ) {				\
    chi_p = &buf[SE->_offset];					\
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
  }
 // Exterior variant: the complement of the interior one. Only entries that are
 // neither local nor on the same node contribute (always read from `buf`);
 // each contributing leg increments the counter `nmu` in the expanding scope.
 #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
    chi_p = &buf[SE->_offset];					\
    multLink(Uchi, U._odata[sU], *chi_p, Dir);			\
  }
 // Constructor: forwards the implementation parameters to the Base class.
 template <class Impl>
 StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 ////////////////////////////////////////////////////////////////////////////////////
 // Generic implementation; move to different file?
 // Int, Ext, Int+Ext cases for comms overlap
 ////////////////////////////////////////////////////////////////////////////////////
 // Generic kernel, all stencil legs in one pass.
 // For each s in [0,LLs) the output site is sF = LLs*sU + s. Sixteen legs are
 // accumulated into Uchi: eight with skew 0 using U and eight with skew 8 using
 // UUU. The result is negated when dag is non-zero and written to
 // out._odata[sF] with vstream. `lo` is not referenced in this routine.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
 					     DoubledGaugeField &U, DoubledGaugeField &UUU,
 					     SiteSpinor *buf, int LLs, int sU, 
 					     const FermionField &in, FermionField &out, int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew;
  for(int s=0;s<LLs;s++){
    int sF=LLs*sU+s;
    skew = 0;
    // The first leg uses multLink (not multLinkAdd), which is what initialises
    // Uchi for this site; every later leg accumulates.
    GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
    GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
    // Second set of eight legs: stencil entries 8..15, links taken from UUU.
    skew=8;
    GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
    // The daggered operator is obtained by flipping the overall sign.
    if ( dag ) { 
      Uchi = - Uchi;
    } 
    vstream(out._odata[sF], Uchi);
  }
 };
  ///////////////////////////////////////////////////
  // Only contributions from interior of our node
  ///////////////////////////////////////////////////
 // Generic kernel, interior legs only.
 // Same site loop and leg ordering as the all-in-one generic kernel, but each
 // leg goes through GENERIC_STENCIL_LEG_INT, so a leg only contributes when its
 // stencil entry is local or st.same_node[Dir] is set. Uchi is zeroed first
 // because any individual leg may be skipped. The (possibly negated) sum is
 // written to out._odata[sF] with vstream, overwriting the previous content.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
 						DoubledGaugeField &U, DoubledGaugeField &UUU,
 						SiteSpinor *buf, int LLs, int sU, 
 						const FermionField &in, FermionField &out,int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew ;
  for(int s=0;s<LLs;s++){
    int sF=LLs*sU+s;
    skew = 0;
    Uchi=zero;
    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
    // Second set of eight legs: stencil entries 8..15, links taken from UUU.
    skew=8;
    GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
    // The daggered operator is obtained by flipping the overall sign.
    if ( dag ) {
      Uchi = - Uchi;
    }
    vstream(out._odata[sF], Uchi);
  }
 };
  ///////////////////////////////////////////////////
  // Only contributions from exterior of our node
  ///////////////////////////////////////////////////
 // Generic kernel, exterior legs only.
 // For each s in [0,LLs) the site is sF = LLs*sU + s. Each of the sixteen legs
 // goes through GENERIC_STENCIL_LEG_EXT, so it only contributes when its stencil
 // entry is neither local nor on the same node; contributing legs bump `nmu`.
 // If at least one leg contributed for this site, the sum is added to (dag == 0)
 // or subtracted from (dag != 0) the value already held in out._odata[sF];
 // otherwise the site is left untouched.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
 						DoubledGaugeField &U, DoubledGaugeField &UUU,
 						SiteSpinor *buf, int LLs, int sU,
 						const FermionField &in, FermionField &out,int dag) {
  const SiteSpinor *chi_p;
  SiteSpinor chi;
  SiteSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew ;
  for(int s=0;s<LLs;s++){
    int sF=LLs*sU+s;
    // Count exterior legs per site: the counter is reset for every s so that a
    // site without exterior legs is not rewritten merely because an earlier
    // slice had some (this matches the hand-unrolled exterior kernel).
    int nmu=0;
    skew = 0;
    Uchi=zero;
    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
    // Second set of eight legs: stencil entries 8..15, links taken from UUU.
    skew=8;
    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
    // Accumulate on top of whatever is already stored for this site.
    if ( nmu ) { 
      if ( dag ) { 
 	out._odata[sF] = out._odata[sF] - Uchi;
      } else { 
 	out._odata[sF] = out._odata[sF] + Uchi;
      }
    }
  }
 };
 ////////////////////////////////////////////////////////////////////////////////////
 // Driving / wrapping routine to select right kernel
 ////////////////////////////////////////////////////////////////////////////////////
 // Daggered single-site entry point: forwards every argument to the
 // dag-aware DhopSite overload with the dagger flag set to 1.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
 					 DoubledGaugeField &U, DoubledGaugeField &UUU,
 					 SiteSpinor *buf, int LLs, int sU,
 					 const FermionField &in, FermionField &out,
 					 int interior, int exterior)
 {
  const int daggered = 1;
  DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, daggered, interior, exterior);
 }
 // Non-daggered single-site entry point: forwards every argument to the
 // dag-aware DhopSite overload with the dagger flag set to 0.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
 				      DoubledGaugeField &U, DoubledGaugeField &UUU,
 				      SiteSpinor *buf, int LLs, int sU,
 				      const FermionField &in, FermionField &out,
 				      int interior, int exterior)
 {
  const int notDaggered = 0;
  DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, notDaggered, interior, exterior);
 }
 // Dispatcher: picks the kernel family from the static Opt switch and, within
 // the family, the all-in-one, interior-only or exterior-only variant from the
 // interior / exterior flags. When both flags are zero no kernel is called.
 // The assembly family (only compiled in under AVX512) supports the all-in-one
 // case only and reports an error otherwise.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
 				      SiteSpinor *buf, int LLs,
 				      int sU, const FermionField &in, FermionField &out,
 				      int dag,int interior,int exterior) 
 {
  const bool fused        = interior && exterior;   // whole stencil in one pass
  const bool interiorOnly = interior && !exterior;
  const bool exteriorOnly = exterior && !interior;

  switch(Opt) {
 #ifdef AVX512
  case OptInlineAsm:
    if ( !fused ) {
      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
      assert(0);
      break;
    }
    DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
    break;
 #endif
  case OptHandUnroll:
    if      ( fused        ) { DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag); }
    else if ( interiorOnly ) { DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); }
    else if ( exteriorOnly ) { DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); }
    break;
  case OptGeneric:
    if      ( fused        ) { DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag); }
    else if ( interiorOnly ) { DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); }
    else if ( exteriorOnly ) { DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); }
    break;
  default:
    // Unknown Opt value (this includes OptInlineAsm when AVX512 is not defined).
    std::cout<<"Oops Opt = "<<Opt<<std::endl;
    assert(0);
    break;
  }
 }
 // Single-direction hop: not implemented. The body is only assert(0), so any
 // call aborts in a build with assertions enabled and does nothing otherwise.
 // The notes inside are the original author's open questions.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,  DoubledGaugeField &UUU, SiteSpinor *buf, int sF,
 				      int sU, const FermionField &in, FermionField &out, int dir, int disp) 
 {
  // Disp should be either +1,-1,+3,-3
  // What about "dag" ?
  // Because we work out pU . dS/dU 
  // U
  assert(0);
 }
 // Project macros defined outside this file; presumably they explicitly
 // instantiate StaggeredKernels for the supported implementations -- confirm
 // against their definitions.
 FermOpStaggeredTemplateInstantiate(StaggeredKernels);
 FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
 }}
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -1,122 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/StaggeredKernels.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi, Peter Boyle
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_QCD_STAGGERED_KERNELS_H
 #define GRID_QCD_STAGGERED_KERNELS_H
 namespace Grid {
 namespace QCD {
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Helper routines that implement Staggered stencil for a single site.
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Non-template holder for the run-time switches shared by every
 // StaggeredKernels<Impl> instantiation.
 class StaggeredKernelsStatic { 
 public:
  // Kernel family selected by Opt: generic, hand-unrolled or inline assembly.
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
  // Values for Comms.
  enum { CommsAndCompute, CommsThenCompute };
  static int Opt;    // one of the Opt* values above
  static int Comms;  // one of the Comms* values above
 };
 // Single-site staggered stencil kernels for a given implementation policy.
 // Declares three kernel families (generic, hand-unrolled, assembly), each
 // taking the same argument list, plus DhopSite/DhopSiteDag entry points that
 // fan out to one of them.
 //
 // Common kernel arguments:
 //   st        stencil used to look up neighbour entries
 //   lo        Lebesgue ordering object
 //   U, UUU    two doubled gauge fields
 //   buf       buffer holding neighbour data that is not read from `in`
 //   LLs, sU   the kernels loop s over [0,LLs) for base site sU
 //   in, out   input and output fermion fields
 //   dag       non-zero selects the daggered operator
 template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic { 
 public:
  INHERIT_IMPL_TYPES(Impl);
  typedef FermionOperator<Impl> Base;
 public:
   // Single-direction hop.
   void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
 		      int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
   ///////////////////////////////////////////////////////////////////////////////////////
   // Generic Nc kernels
   // (plain: all legs; Int: interior legs only; Ext: exterior legs only)
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
 			DoubledGaugeField &U, DoubledGaugeField &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionField &in, FermionField &out,int dag);
   void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
 			   DoubledGaugeField &U, DoubledGaugeField &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionField &in, FermionField &out,int dag);
   void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
 			   DoubledGaugeField &U, DoubledGaugeField &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
 			   const FermionField &in, FermionField &out,int dag);
   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   // (plain: all legs; Int: interior legs only; Ext: exterior legs only)
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
 		     DoubledGaugeField &U,DoubledGaugeField &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
 		     const FermionField &in, FermionField &out,int dag);
   void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
 			DoubledGaugeField &U,DoubledGaugeField &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionField &in, FermionField &out,int dag);
   void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
 			DoubledGaugeField &U,DoubledGaugeField &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
 			const FermionField &in, FermionField &out,int dag);
   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 		    DoubledGaugeField &U,DoubledGaugeField &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
 		    const FermionField &in, FermionField &out,int dag);
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   // Generic interface; fan out to right routine
   // interior / exterior default to 1, i.e. the whole stencil in one call.
   ///////////////////////////////////////////////////////////////////////////////////////////////////
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
 		 DoubledGaugeField &U, DoubledGaugeField &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
 		 const FermionField &in, FermionField &out, int interior=1,int exterior=1);
   void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, 
 		    DoubledGaugeField &U, DoubledGaugeField &UUU, 
 		    SiteSpinor * buf, int LLs, int sU,
 		    const FermionField &in, FermionField &out, int interior=1,int exterior=1);
   // Overload with an explicit dag flag and no defaulted arguments.
   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
 		 DoubledGaugeField &U, DoubledGaugeField &UUU, 
 		 SiteSpinor * buf, int LLs, int sU,
 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
 public:
  StaggeredKernels(const ImplParams &p = ImplParams());
 };
 }}
 #endif
--- a/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
@@ -1,399 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/StaggeredKernelsHand.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 // Helper macros for the hand-unrolled kernels. They all rely on names in the
 // expanding scope: st, SE, ptype, sF, sU, in, buf, offset, local, perm, nmu,
 // Chi_0..Chi_2, U_00..U_22 and the accumulators named by the `even`/UChi
 // argument (e.g. even_0..even_2).
 //
 // LOAD_CHI: copy the three components of b[offset] into Chi_0..Chi_2.
 #define LOAD_CHI(b)		\
  const SiteSpinor & ref (b[offset]);	\
    Chi_0=ref()()(0);\
    Chi_1=ref()()(1);\
    Chi_2=ref()()(2);
 // To splat or not to splat depends on the implementation
 // MULT: load the 3x3 link U._odata[sU](A) and ASSIGN UChi_i = sum_j U_ij * Chi_j.
 // Note that, unlike MULT_ADD, the gauge field is always the `U` in scope.
 #define MULT(A,UChi)				\
  auto & ref(U._odata[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));      \
   Impl::loadLinkElement(U_10,ref()(1,0));      \
   Impl::loadLinkElement(U_20,ref()(2,0));      \
   Impl::loadLinkElement(U_01,ref()(0,1));      \
   Impl::loadLinkElement(U_11,ref()(1,1));      \
   Impl::loadLinkElement(U_21,ref()(2,1));      \
   Impl::loadLinkElement(U_02,ref()(0,2));     \
   Impl::loadLinkElement(U_12,ref()(1,2));     \
   Impl::loadLinkElement(U_22,ref()(2,2));     \
    UChi ## _0  = U_00*Chi_0;	       \
    UChi ## _1  = U_10*Chi_0;\
    UChi ## _2  = U_20*Chi_0;\
    UChi ## _0 += U_01*Chi_1;\
    UChi ## _1 += U_11*Chi_1;\
    UChi ## _2 += U_21*Chi_1;\
    UChi ## _0 += U_02*Chi_2;\
    UChi ## _1 += U_12*Chi_2;\
    UChi ## _2 += U_22*Chi_2;
 // MULT_ADD: as MULT, but the gauge field is the macro argument U and the
 // product is ACCUMULATED into UChi_0..UChi_2.
 #define MULT_ADD(U,A,UChi)			\
  auto & ref(U._odata[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));      \
   Impl::loadLinkElement(U_10,ref()(1,0));      \
   Impl::loadLinkElement(U_20,ref()(2,0));      \
   Impl::loadLinkElement(U_01,ref()(0,1));      \
   Impl::loadLinkElement(U_11,ref()(1,1));      \
   Impl::loadLinkElement(U_21,ref()(2,1));      \
   Impl::loadLinkElement(U_02,ref()(0,2));     \
   Impl::loadLinkElement(U_12,ref()(1,2));     \
   Impl::loadLinkElement(U_22,ref()(2,2));     \
    UChi ## _0 += U_00*Chi_0;	       \
    UChi ## _1 += U_10*Chi_0;\
    UChi ## _2 += U_20*Chi_0;\
    UChi ## _0 += U_01*Chi_1;\
    UChi ## _1 += U_11*Chi_1;\
    UChi ## _2 += U_21*Chi_1;\
    UChi ## _0 += U_02*Chi_2;\
    UChi ## _1 += U_12*Chi_2;\
    UChi ## _2 += U_22*Chi_2;
 // PERMUTE_DIR: apply permute<dir> in place to each of Chi_0..Chi_2.
 #define PERMUTE_DIR(dir)			\
  permute##dir(Chi_0,Chi_0);			\
  permute##dir(Chi_1,Chi_1);			\
  permute##dir(Chi_2,Chi_2);
 // HAND_STENCIL_LEG_BASE: fetch stencil entry Dir+skew for site sF and load the
 // neighbour into Chi_*: from `in` (permuted when flagged) if local, else from buf.
 #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);	\
  offset = SE->_offset;			\
  local  = SE->_is_local;		\
  perm   = SE->_permute;		\
  if ( local ) {						\
    LOAD_CHI(in._odata);					\
    if ( perm) {						\
      PERMUTE_DIR(Perm);					\
    }								\
  } else {							\
    LOAD_CHI(buf);						\
  }								
 // HAND_STENCIL_LEG_BEGIN: load the neighbour, then MULT (assignment), which
 // initialises the named accumulator set.
 #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
  {								\
    MULT(Dir,even);						\
  }
 // HAND_STENCIL_LEG: load the neighbour, then MULT_ADD (accumulation) with field U.
 #define HAND_STENCIL_LEG(U,Dir,Perm,skew,even)			\
  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
  {								\
    MULT_ADD(U,Dir,even);					\
  }
 // HAND_STENCIL_LEG_INT: interior variant; the leg contributes only when the
 // entry is local or st.same_node[Dir] is set.
 #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ( local ) {					\
    LOAD_CHI(in._odata);				\
    if ( perm) {					\
      PERMUTE_DIR(Perm);				\
    }							\
  } else if ( st.same_node[Dir] ) {			\
    LOAD_CHI(buf);					\
  }							\
  if (SE->_is_local || st.same_node[Dir] ) {		\
    MULT_ADD(U,Dir,even);				\
  }
 // HAND_STENCIL_LEG_EXT: exterior variant; the leg contributes only when the
 // entry is neither local nor on the same node, and then increments nmu.
 #define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
    { LOAD_CHI(buf);	  }					\
    { MULT_ADD(U,Dir,even); }					\
  }								
 namespace Grid {
 namespace QCD {
 // Hand-unrolled kernel, all stencil legs in one pass.
 // For each s in [0,LLs) the output site is sF = s + LLs*sU. The sixteen legs
 // (eight with skew 0 on U, eight with skew 8 on UUU) are split alternately
 // over two accumulator sets, even_* and odd_*, which are summed (and negated
 // when dag is non-zero) into `result` and written to out._odata[sF] with vstream.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
 					  DoubledGaugeField &U,DoubledGaugeField &UUU,
 					  SiteSpinor *buf, int LLs, int sU, 
 					  const FermionField &in, FermionField &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  Simd even_0; // 12 regs on knc
  Simd even_1;
  Simd even_2;
  Simd odd_0; // 12 regs on knc
  Simd odd_1;
  Simd odd_2;
  Simd Chi_0;    // two spinor; 6 regs
  Simd Chi_1;
  Simd Chi_2;
  Simd U_00;  // two rows of U matrix
  Simd U_10;
  Simd U_20;  
  Simd U_01;
  Simd U_11;
  Simd U_21;  // 2 reg left.
  Simd U_02;
  Simd U_12;
  Simd U_22; 
  SiteSpinor result;
  int offset,local,perm, ptype;
  StencilEntry *SE;
  int skew;
  for(int s=0;s<LLs;s++){
    int sF=s+LLs*sU;
    skew = 0;
    // The two _BEGIN legs assign (rather than accumulate), which initialises
    // the even_* and odd_* accumulators for this site.
    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);  
    HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);   
    HAND_STENCIL_LEG      (U,Zp,1,skew,even);  
    HAND_STENCIL_LEG      (U,Tp,0,skew,odd);  
    HAND_STENCIL_LEG      (U,Xm,3,skew,even);  
    HAND_STENCIL_LEG      (U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG      (U,Zm,1,skew,even);  
    HAND_STENCIL_LEG      (U,Tm,0,skew,odd);  
    // Second set of eight legs: stencil entries 8..15, links taken from UUU.
    skew = 8;
    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);   
    HAND_STENCIL_LEG(UUU,Zp,1,skew,even);  
    HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);  
    HAND_STENCIL_LEG(UUU,Xm,3,skew,even);  
    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);  
    // Combine the two accumulator sets; dag flips the overall sign.
    if ( dag ) {
      result()()(0) = - even_0 - odd_0;
      result()()(1) = - even_1 - odd_1;
      result()()(2) = - even_2 - odd_2;
    } else { 
      result()()(0) = even_0 + odd_0;
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
    vstream(out._odata[sF],result);
  }
 }
 // Hand-unrolled kernel, interior legs only.
 // Same site loop and leg ordering as the all-in-one hand kernel, but every leg
 // goes through HAND_STENCIL_LEG_INT and therefore only contributes when its
 // stencil entry is local or st.same_node[Dir] is set. Both accumulator sets
 // are zeroed per site because any individual leg may be skipped. The result
 // overwrites out._odata[sF] via vstream.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
 					     DoubledGaugeField &U, DoubledGaugeField &UUU,
 					     SiteSpinor *buf, int LLs, int sU, 
 					     const FermionField &in, FermionField &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  Simd even_0; // 12 regs on knc
  Simd even_1;
  Simd even_2;
  Simd odd_0; // 12 regs on knc
  Simd odd_1;
  Simd odd_2;
  Simd Chi_0;    // two spinor; 6 regs
  Simd Chi_1;
  Simd Chi_2;
  Simd U_00;  // two rows of U matrix
  Simd U_10;
  Simd U_20;  
  Simd U_01;
  Simd U_11;
  Simd U_21;  // 2 reg left.
  Simd U_02;
  Simd U_12;
  Simd U_22; 
  SiteSpinor result;
  int offset,local,perm, ptype;
  StencilEntry *SE;
  int skew;
  for(int s=0;s<LLs;s++){
    int sF=s+LLs*sU;
    even_0 = zero;    even_1 = zero;    even_2 = zero;
     odd_0 = zero;     odd_1 = zero;     odd_2 = zero;
    skew = 0;
    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
    HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);   
    HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);  
    HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);  
    HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);  
    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);  
    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);  
    // Second set of eight legs: stencil entries 8..15, links taken from UUU.
    skew = 8;
    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);   
    HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);  
    HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);  
    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
    // Combine the two accumulator sets; dag flips the overall sign.
    if ( dag ) {
      result()()(0) = - even_0 - odd_0;
      result()()(1) = - even_1 - odd_1;
      result()()(2) = - even_2 - odd_2;
    } else { 
      result()()(0) = even_0 + odd_0;
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
    vstream(out._odata[sF],result);
  }
 }
 // Hand-unrolled kernel, exterior legs only.
 // Every leg goes through HAND_STENCIL_LEG_EXT, so it only contributes when its
 // stencil entry is neither local nor on the same node; contributing legs bump
 // the per-site counter nmu. When nmu is non-zero the (possibly negated) sum is
 // added to the value already stored in out._odata[sF]; otherwise the site is
 // left untouched.
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
 					     DoubledGaugeField &U, DoubledGaugeField &UUU,
 					     SiteSpinor *buf, int LLs, int sU, 
 					     const FermionField &in, FermionField &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  Simd even_0; // 12 regs on knc
  Simd even_1;
  Simd even_2;
  Simd odd_0; // 12 regs on knc
  Simd odd_1;
  Simd odd_2;
  Simd Chi_0;    // two spinor; 6 regs
  Simd Chi_1;
  Simd Chi_2;
  Simd U_00;  // two rows of U matrix
  Simd U_10;
  Simd U_20;  
  Simd U_01;
  Simd U_11;
  Simd U_21;  // 2 reg left.
  Simd U_02;
  Simd U_12;
  Simd U_22; 
  SiteSpinor result;
  int offset,local,perm, ptype;
  StencilEntry *SE;
  int skew;
  for(int s=0;s<LLs;s++){
    int sF=s+LLs*sU;
    even_0 = zero;    even_1 = zero;    even_2 = zero;
     odd_0 = zero;     odd_1 = zero;     odd_2 = zero;
    // Number of exterior legs found for this site; reset for every s.
    int nmu=0;
    skew = 0;
    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
    HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);  
    HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);  
    HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);  
    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);  
    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);  
    // Second set of eight legs: stencil entries 8..15, links taken from UUU.
    skew = 8;
    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);  
    HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);   
    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);  
    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);  
    // Add sum of all exterior connected stencil legs
    if ( nmu ) { 
      if ( dag ) {
 	result()()(0) = - even_0 - odd_0;
 	result()()(1) = - even_1 - odd_1;
 	result()()(2) = - even_2 - odd_2;
      } else { 
 	result()()(0) = even_0 + odd_0;
 	result()()(1) = even_1 + odd_1;
 	result()()(2) = even_2 + odd_2;
      }
      // Accumulate on top of whatever is already stored for this site.
      out._odata[sF] = out._odata[sF] + result;
    }
  }
 }
 // Explicitly instantiates the three hand-unrolled kernels (all legs, interior
 // only, exterior only) for one implementation type IMPL.
 // The final line of the macro deliberately carries no line-continuation
 // backslash: a trailing backslash there would splice the first invocation
 // below into the macro body whenever no blank line separates them.
 #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
 						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
 						     const FermionField &in, FermionField &out, int dag); \
 									\
  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
 						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
 						     const FermionField &in, FermionField &out, int dag); \
 									\
  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
 						     DoubledGaugeField &U,DoubledGaugeField &UUU, \
 						     SiteSpinor *buf, int LLs, int sU, \
 						     const FermionField &in, FermionField &out, int dag);

 // One instantiation per implementation type used with the hand kernels.
 DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
 DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
 DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
 DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
 }
 }
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.cc
@@ -1,243 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
    Copyright (C) 2017
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 //#include <Grid/Eigen/Dense>
 #include <Grid/qcd/spin/Dirac.h>
 namespace Grid
 {
 namespace QCD
 {
 // *NOT* EO
 template <class Impl>
 RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 {
   // Apply the full operator: hopping term plus the site-diagonal clover term.
   // Returns norm2 of the result.
   // Hopping (Wilson) contribution is written straight into the output field.
   out.checkerboard = in.checkerboard;
   this->Dhop(in, out, DaggerNo);
   // Clover contribution is computed into a scratch field and added on top.
   FermionField cloverPart(out._grid);
   Mooee(in, cloverPart);
   out += cloverPart;
   const RealD normSq = norm2(out);
   return normSq;
 }
 template <class Impl>
 RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 {
   // Daggered counterpart of M: daggered hopping term plus daggered clover term.
   // Returns norm2 of the result.
   // Hopping (Wilson) contribution, daggered, written into the output field.
   out.checkerboard = in.checkerboard;
   this->Dhop(in, out, DaggerYes);
   // Daggered clover contribution goes through a scratch field.
   FermionField cloverPart(out._grid);
   MooeeDag(in, cloverPart);
   out += cloverPart;
   const RealD normSq = norm2(out);
   return normSq;
 }
 template <class Impl>
 void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 {
   // Builds the clover term, its site-wise inverse and all checkerboarded /
   // adjoint copies from the gauge field _Umu.
   WilsonFermion<Impl>::ImportGauge(_Umu);
   GridBase *lattice = _Umu._grid;
   typedef typename Impl::GaugeLinkField LinkField;
   LinkField Bx(lattice), By(lattice), Bz(lattice), Ex(lattice), Ey(lattice), Ez(lattice);
   // Field strength components, one per plane (mu > nu)
   WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
   WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
   WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
   WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
   WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
   WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
   // Assemble the clover operator in colour and spin space; the B terms are
   // weighted with csw_r, the E terms with csw_t, then the diagonal mass is added.
   CloverTerm  = fillCloverYZ(Bx) * csw_r;
   CloverTerm += fillCloverXZ(By) * csw_r;
   CloverTerm += fillCloverXY(Bz) * csw_r;
   CloverTerm += fillCloverXT(Ex) * csw_t;
   CloverTerm += fillCloverYT(Ey) * csw_t;
   CloverTerm += fillCloverZT(Ez) * csw_t;
   CloverTerm += diag_mass;
   // Site-by-site inversion: flatten the (spin, colour) block into a dense
   // Eigen matrix, invert it, and scatter the inverse back.
   const int localVolume = lattice->lSites();
   const int colourDim = Impl::Dimension;
   const int matDim = Ns * colourDim;
   Eigen::MatrixXcd cloverBlock = Eigen::MatrixXcd::Zero(matDim, matDim);
   Eigen::MatrixXcd cloverBlockInv = Eigen::MatrixXcd::Zero(matDim, matDim);
   std::vector<int> siteCoor;
   typename SiteCloverType::scalar_object siteClover = zero, siteCloverInv = zero;
   for (int idx = 0; idx < localVolume; ++idx)
   {
     lattice->LocalIndexToLocalCoor(idx, siteCoor);
     cloverBlock = Eigen::MatrixXcd::Zero(matDim, matDim);
     peekLocalSite(siteClover, CloverTerm, siteCoor);
     siteCloverInv = zero;
     // Row index = colour + spin * colourDim (and likewise for the column).
     for (int s1 = 0; s1 < Ns; ++s1)
     {
       for (int s2 = 0; s2 < Ns; ++s2)
       {
         for (int c1 = 0; c1 < colourDim; ++c1)
         {
           for (int c2 = 0; c2 < colourDim; ++c2)
           {
             cloverBlock(c1 + s1 * colourDim, c2 + s2 * colourDim) = siteClover()(s1, s2)(c1, c2);
           }
         }
       }
     }
     cloverBlockInv = cloverBlock.inverse();
     for (int s1 = 0; s1 < Ns; ++s1)
     {
       for (int s2 = 0; s2 < Ns; ++s2)
       {
         for (int c1 = 0; c1 < colourDim; ++c1)
         {
           for (int c2 = 0; c2 < colourDim; ++c2)
           {
             siteCloverInv()(s1, s2)(c1, c2) = cloverBlockInv(c1 + s1 * colourDim, c2 + s2 * colourDim);
           }
         }
       }
     }
     pokeLocalSite(siteCloverInv, CloverTermInv, siteCoor);
   }
   // Split every variant (plain, adjoint, inverse, inverse adjoint) into its
   // even and odd checkerboards.
   pickCheckerboard(Even, CloverTermEven, CloverTerm);
   pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
   pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
   pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
   pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
   pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
   pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
   pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
 }
 template <class Impl>
 void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
 {
   // Clover term, neither daggered nor inverted.
   const int useDagger = DaggerNo;
   const int useInverse = InverseNo;
   this->MooeeInternal(in, out, useDagger, useInverse);
 }
 template <class Impl>
 void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
 {
   // Clover term, daggered but not inverted.
   const int useDagger = DaggerYes;
   const int useInverse = InverseNo;
   this->MooeeInternal(in, out, useDagger, useInverse);
 }
 template <class Impl>
 void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
 {
   // Inverse clover term, not daggered.
   const int useDagger = DaggerNo;
   const int useInverse = InverseYes;
   this->MooeeInternal(in, out, useDagger, useInverse);
 }
 template <class Impl>
 void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
 {
   // Inverse clover term, daggered.
   const int useDagger = DaggerYes;
   const int useInverse = InverseYes;
   this->MooeeInternal(in, out, useDagger, useInverse);
 }
 template <class Impl>
 void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 {
   // Multiplies `in` by the clover term selected by (dag, inv) and by the
   // checkerboard of `in`, writing the product to `out`.
   out.checkerboard = in.checkerboard;
   assert(in.checkerboard == Odd || in.checkerboard == Even);
   // Full (non-checkerboarded) grid: use the full-lattice term and take the
   // adjoint on the fly when a dagger is requested.
   if (!in._grid->_isCheckerBoarded)
   {
     CloverFieldType *fullTerm = (inv) ? &CloverTermInv : &CloverTerm;
     if (dag)
     {
       out = adj(*fullTerm) * in;
     }
     else
     {
       out = *fullTerm * in;
     }
     return;
   }
   // Checkerboarded grid: pick the precomputed field for this parity.
   const bool onOdd = (in.checkerboard == Odd);
   CloverFieldType *cbTerm;
   if (dag)
   {
     if (onOdd)
       cbTerm = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
     else
       cbTerm = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
   }
   else
   {
     if (onOdd)
       cbTerm = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
     else
       cbTerm = (inv) ? &CloverTermInvEven : &CloverTermEven;
   }
   out = *cbTerm * in;
 } // MooeeInternal
 // Derivative parts
 // MooDeriv is a placeholder: the body only asserts, so calling it aborts in
 // builds where assert is active. None of the arguments are read.
 template <class Impl>
 void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 {
  assert(0);
 }
 // Derivative parts
 // MeeDeriv is not implemented: the body only asserts, so calling it aborts in
 // builds where assert is active. None of the arguments are read.
 template <class Impl>
 void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 {
  assert(0); // not implemented yet
 }
 // Template instantiations of WilsonCloverFermion via the project's
 // instantiation macros (defined elsewhere). The G-parity variant is
 // currently disabled.
 FermOpTemplateInstantiate(WilsonCloverFermion);
 AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
 TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
 //GparityFermOpTemplateInstantiate(WilsonCloverFermion);
 }
 }
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -1,367 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
    Copyright (C) 2017
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: David Preti <>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
 #define GRID_QCD_WILSON_CLOVER_FERMION_H
 #include <Grid/Grid.h>
 namespace Grid
 {
 namespace QCD
 {
 ///////////////////////////////////////////////////////////////////
 // Wilson Clover
 //
 // Operator ( with anisotropy coefficients):
 //
 // Q =   1 + (Nd-1)/xi_0 + m
 //     + W_t + (nu/xi_0) * W_s
 //     - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_r/xi_0) * sum_ss (sigma_ss F_ss)  ]
 //
 // s spatial, t temporal directions.
 // where W_t and W_s are the temporal and spatial components of the
 // Wilson Dirac operator
 //
 // csw_r = csw_t to recover the isotropic version
 //////////////////////////////////////////////////////////////////
 template <class Impl>
 class WilsonCloverFermion : public WilsonFermion<Impl>
 {
 public:
  // Types definitions
  INHERIT_IMPL_TYPES(Impl);
  // Per-site clover object: an Ns x Ns matrix whose entries are
  // Impl::Dimension x Impl::Dimension matrices, wrapped in an iScalar.
  template <typename vtype>
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef iImplClover<Simd> SiteCloverType;
  // Lattice field holding one SiteCloverType per site.
  typedef Lattice<SiteCloverType> CloverFieldType;
 public:
  typedef WilsonFermion<Impl> WilsonBase;
  // Always returns 0.
  virtual int    ConstEE(void)     { return 0; };
  // Empty body.
  virtual void Instantiatable(void){};
  // Constructors
  // Builds the operator on the full grid Fgrid and the checkerboarded grid
  // Hgrid. The stored coefficients are the user-supplied _csw_r / _csw_t times
  // 1/2 (and 1/xi_0 for the spatial one in the anisotropic case); the clover
  // fields are filled by the ImportGauge call at the end.
  WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                      GridRedBlackCartesian &Hgrid,
                      const RealD _mass,
                      const RealD _csw_r = 0.0,
                      const RealD _csw_t = 0.0,
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
                      const ImplParams &impl_p = ImplParams())
      : WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy),
        CloverTerm(&Fgrid),
        CloverTermInv(&Fgrid),
        CloverTermEven(&Hgrid),
        CloverTermOdd(&Hgrid),
        CloverTermInvEven(&Hgrid),
        CloverTermInvOdd(&Hgrid),
        CloverTermDagEven(&Hgrid),
        CloverTermDagOdd(&Hgrid),
        CloverTermInvDagEven(&Hgrid),
        CloverTermInvDagOdd(&Hgrid)
  {
    assert(Nd == 4); // require 4 dimensions
    // Temporal coefficient: only the factor 1/2 is applied.
    csw_t = _csw_t * 0.5;
    if (!clover_anisotropy.isAnisotropic)
    {
      // Isotropic case
      csw_r = _csw_r * 0.5;
      diag_mass = 4.0 + _mass;
    }
    else
    {
      // Anisotropic case: spatial coefficient and diagonal term pick up xi_0 / nu.
      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
    }
    // Warn when either coefficient vanishes.
    const bool spatialVanishes = (csw_r == 0);
    const bool temporalVanishes = (csw_t == 0);
    if (spatialVanishes)
    {
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
    }
    if (temporalVanishes)
    {
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
    }
    ImportGauge(_Umu);
  }
  // Full operator and its dagger; both return norm2 of the output field.
  virtual RealD M(const FermionField &in, FermionField &out);
  virtual RealD Mdag(const FermionField &in, FermionField &out);
  // Clover term and its dagger / inverse / inverse-dagger; each forwards to
  // MooeeInternal with the matching (dag, inv) flags.
  virtual void Mooee(const FermionField &in, FermionField &out);
  virtual void MooeeDag(const FermionField &in, FermionField &out);
  virtual void MooeeInv(const FermionField &in, FermionField &out);
  virtual void MooeeInvDag(const FermionField &in, FermionField &out);
  virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
  //virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
  // Both derivative hooks are placeholders whose definitions only assert.
  virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
  virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
  // Rebuilds the clover term, its inverse and the checkerboarded copies from _Umu.
  void ImportGauge(const GaugeField &_Umu);
  // Derivative parts unpreconditioned pseudofermions
  //
  // Computes the force for the unpreconditioned operator:
  //   force = (Wilson hopping-term derivative) + (clover-term derivative).
  // force : output, overwritten.
  // X, Y  : fermion fields entering the outer product; must live on the same
  //         grid as `force` (checked with conformable).
  // dag   : forwarded to DhopDeriv.
  //
  // Planes that contain the temporal direction are weighted with csw_t, all
  // other planes with csw_r, matching the weights ImportGauge uses when it
  // builds the clover term.
  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
  {
    conformable(X._grid, Y._grid);
    conformable(X._grid, force._grid);
    GaugeLinkField force_mu(force._grid), lambda(force._grid);
    GaugeField clover_force(force._grid);
    PropagatorField Lambda(force._grid);
    // Guido: Here we are hitting some performance issues:
    // need to extract the components of the DoubledGaugeField
    // for each call
    // Possible solution
    // Create a vector object to store them? (cons: wasting space)
    std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
    Impl::extractLinkField(U, this->Umu);
    force = zero;
    // Derivative of the Wilson hopping term
    this->DhopDeriv(force, X, Y, dag);
    ///////////////////////////////////////////////////////////
    // Clover term derivative
    ///////////////////////////////////////////////////////////
    Impl::outerProductImpl(Lambda, X, Y);
    //std::cout << "Lambda:" << Lambda << std::endl;
    Gamma::Algebra sigma[] = {
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::MinusSigmaXY,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::MinusSigmaXZ,
        Gamma::Algebra::MinusSigmaYZ,
        Gamma::Algebra::SigmaZT,
        Gamma::Algebra::MinusSigmaXT,
        Gamma::Algebra::MinusSigmaYT,
        Gamma::Algebra::MinusSigmaZT};
    /*
      sigma_{\mu \nu}=
      | 0         sigma[0]  sigma[1]  sigma[2] |
      | sigma[3]    0       sigma[4]  sigma[5] |
      | sigma[6]  sigma[7]     0      sigma[8] |
      | sigma[9]  sigma[10] sigma[11]   0      |
    */
    // The temporal direction is the last index (3), as the sigma table above
    // shows: row/column 3 holds the *T entries. The constructor asserts Nd == 4.
    const int temporal_dir = Nd - 1;
    int count = 0;
    clover_force = zero;
    for (int mu = 0; mu < 4; mu++)
    {
      force_mu = zero;
      for (int nu = 0; nu < 4; nu++)
      {
        if (mu == nu)
        continue;
        // Fix: the previous test compared against 4, which mu and nu never
        // reach, so csw_t was never applied to the temporal planes.
        RealD factor;
        if (nu == temporal_dir || mu == temporal_dir)
        {
          factor = 2.0 * csw_t;
        }
        else
        {
          factor = 2.0 * csw_r;
        }
        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
        count++;
      }
      pokeLorentz(clover_force, U[mu] * force_mu, mu);
    }
    //clover_force *= csw;
    force += clover_force;
  }
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  //
  // U      : link fields, indexed by direction; U[0] must be conformable with lambda.
  // lambda : field inserted into the staples.
  // mu, nu : directions selecting the plane.
  // Returns the sum of eight terms: four with lambda inserted in the upper
  // staple (added, C1+..C4+) and four in the lower staple (subtracted, C1-..C4-).
  // `tmp` is reused as scratch for the product that carries the insertion.
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda._grid, U[0]._grid);
    GaugeLinkField out(lambda._grid), tmp(lambda._grid);
    // insertion in upper staple
    // please check redundancy of shift operations
    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
    // C4-
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
    return out;
  }
 private:
  // here fixing the 4 dimensions, make it more general?
  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  // Spin-space placement of the YZ field-strength component F:
  // entries (0,1), (1,0), (2,3) and (3,2) are all set to -i*F, the rest stay zero.
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F._grid);
    T = zero;
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
    {
      auto &site = T._odata[ss];
      const auto &Fsite = F._odata[ss]()();
      site()(0, 1) = timesMinusI(Fsite);
      site()(1, 0) = timesMinusI(Fsite);
      site()(2, 3) = timesMinusI(Fsite);
      site()(3, 2) = timesMinusI(Fsite);
    }
    return T;
  }
  // Spin-space placement of the XZ field-strength component F:
  // (0,1) = -F, (1,0) = +F, (2,3) = -F, (3,2) = +F, the rest stay zero.
  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F._grid);
    T = zero;
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
    {
      auto &site = T._odata[ss];
      const auto &Fsite = F._odata[ss]()();
      site()(0, 1) = -Fsite;
      site()(1, 0) = Fsite;
      site()(2, 3) = -Fsite;
      site()(3, 2) = Fsite;
    }
    return T;
  }
  // Spin-space placement of the XY field-strength component F on the diagonal:
  // (0,0) = -i*F, (1,1) = +i*F, (2,2) = -i*F, (3,3) = +i*F.
  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
    CloverFieldType T(F._grid);
    T = zero;
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
    {
      auto &site = T._odata[ss];
      const auto &Fsite = F._odata[ss]()();
      site()(0, 0) = timesMinusI(Fsite);
      site()(1, 1) = timesI(Fsite);
      site()(2, 2) = timesMinusI(Fsite);
      site()(3, 3) = timesI(Fsite);
    }
    return T;
  }
  // Spin-space placement of the XT field-strength component F:
  // (0,1) = (1,0) = +i*F and (2,3) = (3,2) = -i*F, the rest stay zero.
  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
    CloverFieldType T(F._grid);
    T = zero;
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
    {
      auto &site = T._odata[ss];
      const auto &Fsite = F._odata[ss]()();
      site()(0, 1) = timesI(Fsite);
      site()(1, 0) = timesI(Fsite);
      site()(2, 3) = timesMinusI(Fsite);
      site()(3, 2) = timesMinusI(Fsite);
    }
    return T;
  }
  // Spin-space placement of the YT field-strength component F:
  // (0,1) = -F, (1,0) = +F, (2,3) = +F, (3,2) = -F, the rest stay zero.
  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
    CloverFieldType T(F._grid);
    T = zero;
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
    {
      auto &site = T._odata[ss];
      const auto &Fsite = F._odata[ss]()();
      site()(0, 1) = -(Fsite);
      site()(1, 0) = (Fsite);
      site()(2, 3) = (Fsite);
      site()(3, 2) = -(Fsite);
    }
    return T;
  }
  // Spin-space placement of the ZT field-strength component F on the diagonal:
  // (0,0) = +i*F, (1,1) = -i*F, (2,2) = -i*F, (3,3) = +i*F.
  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
    CloverFieldType T(F._grid);
    T = zero;
    PARALLEL_FOR_LOOP
    for (int ss = 0; ss < CloverTerm._grid->oSites(); ss++)
    {
      auto &site = T._odata[ss];
      const auto &Fsite = F._odata[ss]()();
      site()(0, 0) = timesI(Fsite);
      site()(1, 1) = timesMinusI(Fsite);
      site()(2, 2) = timesMinusI(Fsite);
      site()(3, 3) = timesI(Fsite);
    }
    return T;
  }
 };
 }
 }
 #endif // GRID_QCD_WILSON_CLOVER_FERMION_H
--- a/Grid/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsHand.cc
@@ -1,631 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 // REGISTER is defined with an empty replacement list.
 #define REGISTER
 // LOAD_CHIMU: copies the twelve components (first index 0..3, second index
 // 0..2) of the full spinor at in._odata[offset] into Chimu_<s><c>.
 // Relies on `in` and `offset` being in scope at the expansion site.
 #define LOAD_CHIMU \
  {const SiteSpinor & ref (in._odata[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
    Chimu_02=ref()(0)(2);\
    Chimu_10=ref()(1)(0);\
    Chimu_11=ref()(1)(1);\
    Chimu_12=ref()(1)(2);\
    Chimu_20=ref()(2)(0);\
    Chimu_21=ref()(2)(1);\
    Chimu_22=ref()(2)(2);\
    Chimu_30=ref()(3)(0);\
    Chimu_31=ref()(3)(1);\
    Chimu_32=ref()(3)(2);}
 // LOAD_CHI: copies the six components (first index 0..1, second index 0..2)
 // of the half spinor at buf[offset] into Chi_<s><c>.
 // Relies on `buf` and `offset` being in scope at the expansion site.
 #define LOAD_CHI\
  {const SiteHalfSpinor &ref(buf[offset]);	\
    Chi_00 = ref()(0)(0);\
    Chi_01 = ref()(0)(1);\
    Chi_02 = ref()(0)(2);\
    Chi_10 = ref()(1)(0);\
    Chi_11 = ref()(1)(1);\
    Chi_12 = ref()(1)(2);}
 // To splat or not to splat depends on the implementation
 // MULT_2SPIN(A): multiplies both half-spinor components by the 3x3 link
 // matrix U._odata[sU](A):
 //   UChi_<s><c> = sum_k ref()(c,k) * Chi_<s><k>,  s = 0,1;  c,k = 0..2.
 // Columns 0 and 1 of the matrix are loaded first; U_00/U_10/U_20 are then
 // reloaded with column 2 so only six link temporaries are needed.
 // Link elements are fetched through Impl::loadLinkElement.
 #define MULT_2SPIN(A)\
  {auto & ref(U._odata[sU](A));			\
   Impl::loadLinkElement(U_00,ref()(0,0));	\
   Impl::loadLinkElement(U_10,ref()(1,0));	\
   Impl::loadLinkElement(U_20,ref()(2,0));	\
   Impl::loadLinkElement(U_01,ref()(0,1));	\
   Impl::loadLinkElement(U_11,ref()(1,1));	\
   Impl::loadLinkElement(U_21,ref()(2,1));	\
    UChi_00 = U_00*Chi_00;\
    UChi_10 = U_00*Chi_10;\
    UChi_01 = U_10*Chi_00;\
    UChi_11 = U_10*Chi_10;\
    UChi_02 = U_20*Chi_00;\
    UChi_12 = U_20*Chi_10;\
    UChi_00+= U_01*Chi_01;\
    UChi_10+= U_01*Chi_11;\
    UChi_01+= U_11*Chi_01;\
    UChi_11+= U_11*Chi_11;\
    UChi_02+= U_21*Chi_01;\
    UChi_12+= U_21*Chi_11;\
    Impl::loadLinkElement(U_00,ref()(0,2));	\
    Impl::loadLinkElement(U_10,ref()(1,2));	\
    Impl::loadLinkElement(U_20,ref()(2,2));	\
    UChi_00+= U_00*Chi_02;\
    UChi_10+= U_00*Chi_12;\
    UChi_01+= U_10*Chi_02;\
    UChi_11+= U_10*Chi_12;\
    UChi_02+= U_20*Chi_02;\
    UChi_12+= U_20*Chi_12;}
 // PERMUTE_DIR(dir): applies permute<dir> (name formed by token pasting) in
 // place to each of the six Chi_<s><c> half-spinor components.
 #define PERMUTE_DIR(dir)			\
      permute##dir(Chi_00,Chi_00);\
      permute##dir(Chi_01,Chi_01);\
      permute##dir(Chi_02,Chi_02);\
      permute##dir(Chi_10,Chi_10);\
      permute##dir(Chi_11,Chi_11);\
      permute##dir(Chi_12,Chi_12);
 // Projection macros: each builds the two-component half spinor Chi_<s><c>
 // from the four-component spinor Chimu_<s><c>, colour index by colour index.
 //      hspin(0)=fspin(0)+timesI(fspin(3));
 //      hspin(1)=fspin(1)+timesI(fspin(2));
 #define XP_PROJ \
    Chi_00 = Chimu_00+timesI(Chimu_30);\
    Chi_01 = Chimu_01+timesI(Chimu_31);\
    Chi_02 = Chimu_02+timesI(Chimu_32);\
    Chi_10 = Chimu_10+timesI(Chimu_20);\
    Chi_11 = Chimu_11+timesI(Chimu_21);\
    Chi_12 = Chimu_12+timesI(Chimu_22);
 // YP_PROJ: hspin(0)=fspin(0)-fspin(3); hspin(1)=fspin(1)+fspin(2);
 #define YP_PROJ \
    Chi_00 = Chimu_00-Chimu_30;\
    Chi_01 = Chimu_01-Chimu_31;\
    Chi_02 = Chimu_02-Chimu_32;\
    Chi_10 = Chimu_10+Chimu_20;\
    Chi_11 = Chimu_11+Chimu_21;\
    Chi_12 = Chimu_12+Chimu_22;
 // ZP_PROJ: hspin(0)=fspin(0)+timesI(fspin(2)); hspin(1)=fspin(1)-timesI(fspin(3));
 #define ZP_PROJ \
  Chi_00 = Chimu_00+timesI(Chimu_20);		\
  Chi_01 = Chimu_01+timesI(Chimu_21);		\
  Chi_02 = Chimu_02+timesI(Chimu_22);		\
  Chi_10 = Chimu_10-timesI(Chimu_30);		\
  Chi_11 = Chimu_11-timesI(Chimu_31);		\
  Chi_12 = Chimu_12-timesI(Chimu_32);
 // TP_PROJ: hspin(0)=fspin(0)+fspin(2); hspin(1)=fspin(1)+fspin(3);
 #define TP_PROJ \
  Chi_00 = Chimu_00+Chimu_20;		\
  Chi_01 = Chimu_01+Chimu_21;		\
  Chi_02 = Chimu_02+Chimu_22;		\
  Chi_10 = Chimu_10+Chimu_30;		\
  Chi_11 = Chimu_11+Chimu_31;		\
  Chi_12 = Chimu_12+Chimu_32;
 //      hspin(0)=fspin(0)-timesI(fspin(3));
 //      hspin(1)=fspin(1)-timesI(fspin(2));
 #define XM_PROJ \
    Chi_00 = Chimu_00-timesI(Chimu_30);\
    Chi_01 = Chimu_01-timesI(Chimu_31);\
    Chi_02 = Chimu_02-timesI(Chimu_32);\
    Chi_10 = Chimu_10-timesI(Chimu_20);\
    Chi_11 = Chimu_11-timesI(Chimu_21);\
    Chi_12 = Chimu_12-timesI(Chimu_22);
 // YM_PROJ: hspin(0)=fspin(0)+fspin(3); hspin(1)=fspin(1)-fspin(2);
 #define YM_PROJ \
    Chi_00 = Chimu_00+Chimu_30;\
    Chi_01 = Chimu_01+Chimu_31;\
    Chi_02 = Chimu_02+Chimu_32;\
    Chi_10 = Chimu_10-Chimu_20;\
    Chi_11 = Chimu_11-Chimu_21;\
    Chi_12 = Chimu_12-Chimu_22;
 // ZM_PROJ: hspin(0)=fspin(0)-timesI(fspin(2)); hspin(1)=fspin(1)+timesI(fspin(3));
 #define ZM_PROJ \
  Chi_00 = Chimu_00-timesI(Chimu_20);		\
  Chi_01 = Chimu_01-timesI(Chimu_21);		\
  Chi_02 = Chimu_02-timesI(Chimu_22);		\
  Chi_10 = Chimu_10+timesI(Chimu_30);		\
  Chi_11 = Chimu_11+timesI(Chimu_31);		\
  Chi_12 = Chimu_12+timesI(Chimu_32);
 // TM_PROJ: hspin(0)=fspin(0)-fspin(2); hspin(1)=fspin(1)-fspin(3);
 #define TM_PROJ \
  Chi_00 = Chimu_00-Chimu_20;		\
  Chi_01 = Chimu_01-Chimu_21;		\
  Chi_02 = Chimu_02-Chimu_22;		\
  Chi_10 = Chimu_10-Chimu_30;		\
  Chi_11 = Chimu_11-Chimu_31;		\
  Chi_12 = Chimu_12-Chimu_32;
 // Reconstruction macros: each expands the two-component product UChi_<s><c>
 // back to the four-component result_<s><c>. The plain *_RECON forms assign
 // (overwriting result), the *_RECON_ACCUM forms add to the existing result.
 // In every variant the upper two spin components receive UChi unchanged.
 //      fspin(0)=hspin(0);
 //      fspin(1)=hspin(1);
 //      fspin(2)=timesMinusI(hspin(1));
 //      fspin(3)=timesMinusI(hspin(0));
 #define XP_RECON\
  result_00 = UChi_00;\
  result_01 = UChi_01;\
  result_02 = UChi_02;\
  result_10 = UChi_10;\
  result_11 = UChi_11;\
  result_12 = UChi_12;\
  result_20 = timesMinusI(UChi_10);\
  result_21 = timesMinusI(UChi_11);\
  result_22 = timesMinusI(UChi_12);\
  result_30 = timesMinusI(UChi_00);\
  result_31 = timesMinusI(UChi_01);\
  result_32 = timesMinusI(UChi_02);
 // Accumulating form of XP_RECON: lower components get -= i*hspin(1), -= i*hspin(0).
 #define XP_RECON_ACCUM\
  result_00+=UChi_00;\
  result_01+=UChi_01;\
  result_02+=UChi_02;\
  result_10+=UChi_10;\
  result_11+=UChi_11;\
  result_12+=UChi_12;\
  result_20-=timesI(UChi_10);\
  result_21-=timesI(UChi_11);\
  result_22-=timesI(UChi_12);\
  result_30-=timesI(UChi_00);\
  result_31-=timesI(UChi_01);\
  result_32-=timesI(UChi_02);
 // XM_RECON (assigning): fspin(2)=timesI(hspin(1)); fspin(3)=timesI(hspin(0));
 #define XM_RECON\
  result_00 = UChi_00;\
  result_01 = UChi_01;\
  result_02 = UChi_02;\
  result_10 = UChi_10;\
  result_11 = UChi_11;\
  result_12 = UChi_12;\
  result_20 = timesI(UChi_10);\
  result_21 = timesI(UChi_11);\
  result_22 = timesI(UChi_12);\
  result_30 = timesI(UChi_00);\
  result_31 = timesI(UChi_01);\
  result_32 = timesI(UChi_02);
 // Accumulating form of XM_RECON.
 #define XM_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20+= timesI(UChi_10);\
  result_21+= timesI(UChi_11);\
  result_22+= timesI(UChi_12);\
  result_30+= timesI(UChi_00);\
  result_31+= timesI(UChi_01);\
  result_32+= timesI(UChi_02);
 // YP_RECON_ACCUM: fspin(2)+=hspin(1); fspin(3)-=hspin(0);
 #define YP_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20+= UChi_10;\
  result_21+= UChi_11;\
  result_22+= UChi_12;\
  result_30-= UChi_00;\
  result_31-= UChi_01;\
  result_32-= UChi_02;
 // YM_RECON_ACCUM: fspin(2)-=hspin(1); fspin(3)+=hspin(0);
 #define YM_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20-= UChi_10;\
  result_21-= UChi_11;\
  result_22-= UChi_12;\
  result_30+= UChi_00;\
  result_31+= UChi_01;\
  result_32+= UChi_02;
 // ZP_RECON_ACCUM: fspin(2)-=timesI(hspin(0)); fspin(3)+=timesI(hspin(1));
 #define ZP_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20-= timesI(UChi_00);			\
  result_21-= timesI(UChi_01);			\
  result_22-= timesI(UChi_02);			\
  result_30+= timesI(UChi_10);			\
  result_31+= timesI(UChi_11);			\
  result_32+= timesI(UChi_12);
 // ZM_RECON_ACCUM: fspin(2)+=timesI(hspin(0)); fspin(3)-=timesI(hspin(1));
 #define ZM_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20+= timesI(UChi_00);			\
  result_21+= timesI(UChi_01);			\
  result_22+= timesI(UChi_02);			\
  result_30-= timesI(UChi_10);			\
  result_31-= timesI(UChi_11);			\
  result_32-= timesI(UChi_12);
 // TP_RECON_ACCUM: fspin(2)+=hspin(0); fspin(3)+=hspin(1);
 #define TP_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20+= UChi_00;			\
  result_21+= UChi_01;			\
  result_22+= UChi_02;			\
  result_30+= UChi_10;			\
  result_31+= UChi_11;			\
  result_32+= UChi_12;
 // TM_RECON_ACCUM: fspin(2)-=hspin(0); fspin(3)-=hspin(1);
 #define TM_RECON_ACCUM\
  result_00+= UChi_00;\
  result_01+= UChi_01;\
  result_02+= UChi_02;\
  result_10+= UChi_10;\
  result_11+= UChi_11;\
  result_12+= UChi_12;\
  result_20-= UChi_00;	\
  result_21-= UChi_01;	\
  result_22-= UChi_02;	\
  result_30-= UChi_10;	\
  result_31-= UChi_11;	\
  result_32-= UChi_12;
 // HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON): processes one stencil direction.
 //  - looks up the stencil entry for (DIR, ss) and reads its offset / local /
 //    permute fields (ptype is passed to GetEntry; presumably filled by it --
 //    TODO confirm against the stencil implementation);
 //  - local entry: load the full spinor, apply PROJ, and PERMUTE_DIR(PERM) if
 //    the entry asks for a permute;
 //  - non-local entry: load the already-projected half spinor from buf;
 //  - then multiply by the link in direction DIR and apply RECON.
 #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
    LOAD_CHIMU;					\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
    }						\
  } else {					\
    LOAD_CHI;					\
  }						\
  MULT_2SPIN(DIR);				\
  RECON;					
 // HAND_STENCIL_LEG_INT: same as HAND_STENCIL_LEG, but a non-local entry is
 // only loaded when st.same_node[DIR] is set, and the multiply/RECON step is
 // skipped entirely when the entry is neither local nor on the same node.
 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
    LOAD_CHIMU;					\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
    }						\
  } else if ( st.same_node[DIR] ) {		\
    LOAD_CHI;					\
  }						\
  if (local || st.same_node[DIR] ) {		\
    MULT_2SPIN(DIR);				\
    RECON;					\
  }
 // HAND_STENCIL_LEG_EXT: complement of the _INT form. Only entries that are
 // neither local nor on the same node are processed (loaded from buf,
 // multiplied, reconstructed); each such leg increments nmu. PROJ and PERM
 // are accepted but unused.
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
    LOAD_CHI;					\
    MULT_2SPIN(DIR);				\
    RECON;					\
    nmu++;					\
  }
 // HAND_RESULT(ss): writes the twelve result_<s><c> values to out._odata[ss]
 // using vstream, overwriting the previous contents.
 #define HAND_RESULT(ss)				\
  {						\
    SiteSpinor & ref (out._odata[ss]);		\
    vstream(ref()(0)(0),result_00);		\
    vstream(ref()(0)(1),result_01);		\
    vstream(ref()(0)(2),result_02);		\
    vstream(ref()(1)(0),result_10);		\
    vstream(ref()(1)(1),result_11);		\
    vstream(ref()(1)(2),result_12);		\
    vstream(ref()(2)(0),result_20);		\
    vstream(ref()(2)(1),result_21);		\
    vstream(ref()(2)(2),result_22);		\
    vstream(ref()(3)(0),result_30);		\
    vstream(ref()(3)(1),result_31);		\
    vstream(ref()(3)(2),result_32);		\
  }
 // HAND_RESULT_EXT(ss): adds the twelve result_<s><c> values to
 // out._odata[ss], but only when nmu is non-zero (nmu is incremented by
 // HAND_STENCIL_LEG_EXT for each leg it processes).
 #define HAND_RESULT_EXT(ss)			\
  if (nmu){					\
    SiteSpinor & ref (out._odata[ss]);		\
    ref()(0)(0)+=result_00;		\
    ref()(0)(1)+=result_01;		\
    ref()(0)(2)+=result_02;		\
    ref()(1)(0)+=result_10;		\
    ref()(1)(1)+=result_11;		\
    ref()(1)(2)+=result_12;		\
    ref()(2)(0)+=result_20;		\
    ref()(2)(1)+=result_21;		\
    ref()(2)(2)+=result_22;		\
    ref()(3)(0)+=result_30;		\
    ref()(3)(1)+=result_31;		\
    ref()(3)(2)+=result_32;		\
  }
 // HAND_DECLARATIONS: declares the Simd working set used by the hand-unrolled kernels:
 //   result_<s><c> : twelve output components (s = 0..3, c = 0..2)
 //   Chi_<s><c>    : six components (s = 0..1)
 //   UChi_<s><c>   : six components (s = 0..1)
 //   U_<r><c>      : six scratch values (r = 0..2, c = 0..1)
 // The macro argument is ignored.
 #define HAND_DECLARATIONS(a)			\
  Simd result_00, result_01, result_02;		\
  Simd result_10, result_11, result_12;		\
  Simd result_20, result_21, result_22;		\
  Simd result_30, result_31, result_32;		\
  Simd Chi_00, Chi_01, Chi_02;			\
  Simd Chi_10, Chi_11, Chi_12;			\
  Simd UChi_00, UChi_01, UChi_02;		\
  Simd UChi_10, UChi_11, UChi_12;		\
  Simd U_00, U_10, U_20;			\
  Simd U_01, U_11, U_21;
 // ZERO_RESULT: assigns `zero` to each of the twelve result_<spin><colour> accumulators,
 // one spin row per line. Expects `zero` and the result_* variables to be in scope.
 #define ZERO_RESULT						\
  result_00=zero; result_01=zero; result_02=zero;		\
  result_10=zero; result_11=zero; result_12=zero;		\
  result_20=zero; result_21=zero; result_22=zero;		\
  result_30=zero; result_31=zero; result_32=zero;
 // Chimu_<spin><colour> aliases: the twelve Chimu names map onto existing variables
 // rather than getting storage of their own. Spin rows 0-1 alias Chi_*, spin rows 2-3
 // alias UChi_*, so writing a Chimu value overwrites the corresponding Chi/UChi value.
 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
 #define Chimu_02 Chi_02
 #define Chimu_10 Chi_10
 #define Chimu_11 Chi_11
 #define Chimu_12 Chi_12
 #define Chimu_20 UChi_00
 #define Chimu_21 UChi_01
 #define Chimu_22 UChi_02
 #define Chimu_30 UChi_10
 #define Chimu_31 UChi_11
 #define Chimu_32 UChi_12
 namespace Grid {
 namespace QCD {
 // Hand-unrolled site kernel. Expands HAND_STENCIL_LEG for the eight directions
 // Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm at site ss, then stores result_* into out._odata[ss]
 // through HAND_RESULT.
 // The first leg uses XM_RECON while the other seven use *_RECON_ACCUM, and no
 // ZERO_RESULT is issued here, so the first leg presumably initialises result_*
 // (NOTE(review): confirm against the *_RECON macro definitions).
 // lo, U, buf, sU and in are not named in this body; presumably they are consumed
 // by the LOAD_*/MULT_2SPIN macros defined elsewhere -- confirm.
 template<class Impl> void 
 WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionField &in, FermionField &out)
 {
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  // Scratch variables assigned by every HAND_STENCIL_LEG expansion.
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
  HAND_RESULT(ss);
 }
 // Daggered counterpart of HandDhopSite: same eight directions and the same
 // HAND_STENCIL_LEG / HAND_RESULT sequence, but with the P and M projector/reconstruct
 // macros exchanged on every leg (XP_* on Xp, XM_* on Xm, and so on).
 // As in HandDhopSite the first leg uses the non-ACCUM reconstruct (XP_RECON) and
 // no ZERO_RESULT is issued.
 template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionField &in, FermionField &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  // Scratch variables assigned by every HAND_STENCIL_LEG expansion.
  StencilEntry *SE;
  int offset,local,perm, ptype;
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
  HAND_RESULT(ss);
 }
 // Interior variant of the hand-unrolled site kernel. The accumulators are cleared
 // with ZERO_RESULT, every leg goes through HAND_STENCIL_LEG_INT with an ACCUM
 // reconstruct (legs that are neither local nor same-node are skipped by that macro),
 // and the sum is stored into out._odata[ss] with HAND_RESULT.
 template<class Impl> void 
 WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionField &in, FermionField &out)
 {
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  // Scratch variables assigned by every HAND_STENCIL_LEG_INT expansion.
  int offset,local,perm, ptype;
  StencilEntry *SE;
  // Required because every leg below accumulates and any of them may be skipped.
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
  HAND_RESULT(ss);
 }
 // Daggered interior variant: identical structure to HandDhopSiteInt (ZERO_RESULT,
 // eight HAND_STENCIL_LEG_INT legs with ACCUM reconstructs, HAND_RESULT) with the
 // P and M projector/reconstruct macros exchanged on every leg.
 template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionField &in, FermionField &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  // Scratch variables assigned by every HAND_STENCIL_LEG_INT expansion.
  StencilEntry *SE;
  int offset,local,perm, ptype;
  // Required because every leg below accumulates and any of them may be skipped.
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
  HAND_RESULT(ss);
 }
 // Exterior variant of the hand-unrolled site kernel. Only legs that are neither
 // local nor same-node are processed (see HAND_STENCIL_LEG_EXT); nmu counts them.
 // HAND_RESULT_EXT then adds the accumulated result onto out._odata[ss], and leaves
 // the site untouched when nmu is zero.
 template<class Impl> void 
 WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionField &in, FermionField &out)
 {
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  // HAND_STENCIL_LEG_EXT itself only assigns SE and offset and reads ptype.
  int offset,local,perm, ptype;
  StencilEntry *SE;
  // Number of legs that actually contributed at this site.
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
  HAND_RESULT_EXT(ss);
 }
 // Daggered exterior variant: identical structure to HandDhopSiteExt (ZERO_RESULT,
 // eight HAND_STENCIL_LEG_EXT legs counted in nmu, HAND_RESULT_EXT) with the
 // P and M projector/reconstruct macros exchanged on every leg.
 template<class Impl>
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionField &in, FermionField &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  HAND_DECLARATIONS(ignore);
  // HAND_STENCIL_LEG_EXT itself only assigns SE and offset and reads ptype.
  StencilEntry *SE;
  int offset,local,perm, ptype;
  // Number of legs that actually contributed at this site.
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
  HAND_RESULT_EXT(ss);
 }
 ////////////// Wilson ; uses this implementation /////////////////////
 // INSTANTIATE_THEM(A) emits explicit instantiations of the six hand-unrolled
 // kernels (Site, SiteDag, SiteInt, SiteDagInt, SiteExt, SiteDagExt) for the
 // implementation type A. The list below names every implementation that gets them.
 #define INSTANTIATE_THEM(A) \
 template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 					     int ss,int sU,const FermionField &in, FermionField &out); \
 template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 						int ss,int sU,const FermionField &in, FermionField &out);\
 template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 						int ss,int sU,const FermionField &in, FermionField &out); \
 template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 						   int ss,int sU,const FermionField &in, FermionField &out); \
 template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 						int ss,int sU,const FermionField &in, FermionField &out); \
 template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 						   int ss,int sU,const FermionField &in, FermionField &out); 
 INSTANTIATE_THEM(WilsonImplF);
 INSTANTIATE_THEM(WilsonImplD);
 INSTANTIATE_THEM(ZWilsonImplF);
 INSTANTIATE_THEM(ZWilsonImplD);
 INSTANTIATE_THEM(DomainWallVec5dImplF);
 INSTANTIATE_THEM(DomainWallVec5dImplD);
 INSTANTIATE_THEM(ZDomainWallVec5dImplF);
 INSTANTIATE_THEM(ZDomainWallVec5dImplD);
 INSTANTIATE_THEM(WilsonImplFH);
 INSTANTIATE_THEM(WilsonImplDF);
 INSTANTIATE_THEM(ZWilsonImplFH);
 INSTANTIATE_THEM(ZWilsonImplDF);
 INSTANTIATE_THEM(DomainWallVec5dImplFH);
 INSTANTIATE_THEM(DomainWallVec5dImplDF);
 INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
 INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
 INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
 INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
 }}
--- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@@ -1,155 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/WilsonTMFermion5D.h
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk> ; NB Christoph did similar in GPT
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #pragma once 
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion.h>
 namespace Grid {
  namespace QCD {
    // Wilson operator on a five-dimensional grid with a per-slice mass and
    // twisted-mass parameter: slice s of the first (fifth-dimension) axis uses
    // mass[s] and mu[s]. The hopping term is inherited from WilsonFermion5D, which
    // is constructed with the fixed value 4.0; the diagonal term applied here is
    //   (4 + mass[s]) + i * mu[s] * gamma5
    // through axpbg5y_ssp (presumably "a*x + b*gamma5*y" on slice s, as the
    // commented-out axpibg5x line suggests -- confirm against its definition).
    template<class Impl>
      class WilsonTMFermion5D : public WilsonFermion5D<Impl>
      {
      public:
        INHERIT_IMPL_TYPES(Impl);
      public:
        virtual void   Instantiatable(void) {};
        // Constructors
        // _mass and _mu hold one entry per slice of the fifth dimension; update()
        // asserts that both have length FermionGrid()->_fdimensions[0].
        // The vectors are taken by const reference so the caller's storage is
        // copied once (inside update) instead of twice.
        WilsonTMFermion5D(GaugeField &_Umu,
                          GridCartesian         &Fgrid,
                          GridRedBlackCartesian &Frbgrid,
                          GridCartesian         &Ugrid,
                          GridRedBlackCartesian &Urbgrid,
                          const std::vector<RealD> &_mass,
                          const std::vector<RealD> &_mu,
                          const ImplParams &p= ImplParams()
                          ) :
        WilsonFermion5D<Impl>(_Umu,
                              Fgrid,
                              Frbgrid,
                              Ugrid,
                              Urbgrid,
                              4.0,p)
          {
            update(_mass,_mu);
          }
        // Off-diagonal (checkerboard-changing) hop: DhopEO for an odd input, DhopOE otherwise.
        virtual void Meooe(const FermionField &in, FermionField &out) {
          if (in.checkerboard == Odd) {
            this->DhopEO(in, out, DaggerNo);
          } else {
            this->DhopOE(in, out, DaggerNo);
          }
        }
        // Daggered version of Meooe.
        virtual void MeooeDag(const FermionField &in, FermionField &out) {
          if (in.checkerboard == Odd) {
            this->DhopEO(in, out, DaggerYes);
          } else {
            this->DhopOE(in, out, DaggerYes);
          }
        }
        // allow override for twisted mass and clover
        // Diagonal term: per slice, a = 4 + mass[s] and b = +i*mu[s].
        virtual void Mooee(const FermionField &in, FermionField &out) {
          out.checkerboard = in.checkerboard;
          //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
          for (int s=0;s<(int)this->mass.size();s++) {
            ComplexD a = 4.0+this->mass[s];
            ComplexD b(0.0,this->mu[s]);
            axpbg5y_ssp(out,a,in,b,in,s,s);
          }
        }
        // Daggered diagonal term: same as Mooee with b = -i*mu[s].
        virtual void MooeeDag(const FermionField &in, FermionField &out) {
          out.checkerboard = in.checkerboard;
          for (int s=0;s<(int)this->mass.size();s++) {
            ComplexD a = 4.0+this->mass[s];
            ComplexD b(0.0,-this->mu[s]);
            axpbg5y_ssp(out,a,in,b,in,s,s);
          }
        }
        // Inverse of the diagonal term. With mtil = 4 + mass[s] and tm = mu[s],
        // (mtil + i*tm*g5)^-1 = (mtil - i*tm*g5) / (mtil^2 + tm^2) because g5^2 = 1.
        virtual void MooeeInv(const FermionField &in, FermionField &out) {
          for (int s=0;s<(int)this->mass.size();s++) {
            RealD tm   = this->mu[s];
            RealD mtil = 4.0+this->mass[s];
            RealD sq   = mtil*mtil+tm*tm;
            ComplexD a    = mtil/sq;
            ComplexD b(0.0, -tm /sq);
            axpbg5y_ssp(out,a,in,b,in,s,s);
          }
        }
        // Inverse of the daggered diagonal term: as MooeeInv with the sign of b flipped.
        virtual void MooeeInvDag(const FermionField &in, FermionField &out) {
          for (int s=0;s<(int)this->mass.size();s++) {
            RealD tm   = this->mu[s];
            RealD mtil = 4.0+this->mass[s];
            RealD sq   = mtil*mtil+tm*tm;
            ComplexD a    = mtil/sq;
            ComplexD b(0.0,tm /sq);
            axpbg5y_ssp(out,a,in,b,in,s,s);
          }
        }
        // Full operator: out = Dhop(in) + diagonal(in). Returns the value of
        // axpy_norm(out, 1.0, tmp, out), i.e. whatever norm that helper reports for out.
        virtual RealD M(const FermionField &in, FermionField &out) {
          out.checkerboard = in.checkerboard;
          this->Dhop(in, out, DaggerNo);
          FermionField tmp(out._grid);
          for (int s=0;s<(int)this->mass.size();s++) {
            ComplexD a = 4.0+this->mass[s];
            ComplexD b(0.0,this->mu[s]);
            axpbg5y_ssp(tmp,a,in,b,in,s,s);
          }
          return axpy_norm(out, 1.0, tmp, out);
        }
        // needed for fast PV
        // Replaces the per-slice parameters; both vectors must have one entry per
        // slice of the fifth dimension.
        void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
          assert(_mass.size() == _mu.size());
          assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
          this->mass = _mass;
          this->mu = _mu;
        }
      private:
        std::vector<RealD> mu;   // twisted-mass parameter per fifth-dimension slice
        std::vector<RealD> mass; // mass per fifth-dimension slice
      };
    typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF; 
    typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD; 
 }}
--- a/Grid/qcd/action/gauge/Photon.h
+++ b/Grid/qcd/action/gauge/Photon.h
@@ -1,331 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/gauge/Photon.h
 Copyright (C) 2015-2018
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <J.Harrison@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef QCD_PHOTON_ACTION_H
 #define QCD_PHOTON_ACTION_H
 namespace Grid{
 namespace QCD{
  // Gauge implementation policy for QED: a link is a single complex number
  // (three nested iScalar levels around the SIMD type) and a gauge field is an
  // Nd-vector of such links.
  template <class S>
  class QedGImpl
  {
  public:
    using Simd   = S;
    using Scalar = typename Simd::scalar_type;

    // Per-site tensor shapes, parameterised on the vector type.
    template <typename vtype>
    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
    template <typename vtype>
    using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;

    // Per-site objects.
    using SiteLink    = iImplGaugeLink<Simd>;
    using SiteField   = iImplGaugeField<Simd>;
    using SiteComplex = SiteLink;

    // Lattice-wide fields.
    using LinkField    = Lattice<SiteLink>;
    using Field        = Lattice<SiteField>;
    using ComplexField = Field;
  };
  using QedGImplR = QedGImpl<vComplex>;
  // Photon field helper for a gauge implementation GImpl: builds momentum-space
  // propagators and stochastic photon fields on the grid passed at construction,
  // for a chosen gauge and zero-mode subtraction scheme.
  template <class GImpl>
  class Photon
  {
  public:
    INHERIT_GIMPL_TYPES(GImpl);
    typedef typename SiteGaugeLink::scalar_object ScalarSite;
    typedef typename ScalarSite::scalar_type      ScalarComplex;
    // Gauge choice; only feynman and coulomb are handled by gaugeTransform,
    // landau and undef hit assert(0) there.
    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
    // Zero-mode subtraction scheme, dispatched on in zmSub.
    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
  public:
    // `improvement` supplies per-shell coefficients used by the qedL branch of zmSub;
    // the three-argument overload delegates with an empty vector.
    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvement);
    Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme);
    virtual ~Photon(void) = default;
    // Position-space propagator application: FFT, momentum-space multiply, inverse FFT.
    void FreePropagator(const GaugeField &in, GaugeField &out);
    // out = in * (zero-mode-subtracted 1/khat^2); both fields are in momentum space.
    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
    // weight = sqrt(volume) * sqrt(1/khat^2), then zero-mode subtracted.
    void StochasticWeight(GaugeLinkField &weight);
    // Draws a stochastic field; the two-argument form computes the weight itself.
    void StochasticField(GaugeField &out, GridParallelRNG &rng);
    void StochasticField(GaugeField &out, GridParallelRNG &rng,
                         const GaugeLinkField &weight);
    // Sets every Lorentz component of out to 1.
    void UnitField(GaugeField &out);
  private:
    // Helpers; see the individual definitions for details.
    void makeSpatialNorm(LatticeInteger &spNrm);
    void makeKHat(std::vector<GaugeLinkField> &khat);
    void makeInvKHatSquared(GaugeLinkField &out);
    void zmSub(GaugeLinkField &out);
    void transverseProjectSpatial(GaugeField &out);
    void gaugeTransform(GaugeField &out);
  private:
    GridBase          *grid_;         // not owned; also cast to GridCartesian* for FFTs
    Gauge             gauge_;
    ZmScheme          zmScheme_;
    std::vector<Real> improvement_;   // qedL improvement coefficients (may be empty)
  };
  typedef Photon<QedGImplR>  PhotonR;
  // Full constructor: stores the grid pointer, gauge, zero-mode scheme and a copy
  // of the improvement coefficients.
  template<class GImpl>
  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme,
                        std::vector<Real> improvements)
  : grid_(grid)
  , gauge_(gauge)
  , zmScheme_(zmScheme)
  , improvement_(improvements)
  {
  }
  template<class GImpl>
  Photon<GImpl>::Photon(GridBase *grid, Gauge gauge, ZmScheme zmScheme)
  : Photon(grid, gauge, zmScheme, std::vector<Real>())
  {}
  // Applies the free photon propagator to `in` and writes the result to `out`:
  //   1. forward FFT of `in` over all dimensions,
  //   2. multiplication by the momentum-space propagator,
  //   3. backward FFT into `out`.
  template<class GImpl>
  void Photon<GImpl>::FreePropagator(const GaugeField &in, GaugeField &out)
  {
    FFT        theFFT(dynamic_cast<GridCartesian *>(grid_));
    GaugeField in_k(grid_);
    GaugeField prop_k(grid_);
    theFFT.FFT_all_dim(in_k, in, FFT::forward);
    // MomentumSpacePropagator is declared as (const GaugeField &in, GaugeField &out):
    // the transformed source in_k is the input and prop_k receives the product.
    // Passing them the other way round would read the uninitialised prop_k.
    MomentumSpacePropagator(in_k, prop_k);
    theFFT.FFT_all_dim(out, prop_k, FFT::backward);
  }
  // Fills spNrm with the squared norm of the spatial coordinate at each site.
  // "Spatial" means every direction except the last one. Coordinates at or above
  // half the extent are shifted down by the full extent before squaring.
  template<class GImpl>
  void Photon<GImpl>::makeSpatialNorm(LatticeInteger &spNrm)
  {
    std::vector<int> dims = grid_->FullDimensions();
    LatticeInteger   x(grid_);

    spNrm = zero;
    for (int dir = 0; dir < grid_->Nd() - 1; ++dir)
    {
      const int extent = dims[dir];

      LatticeCoordinate(x, dir);
      // Map [0, extent) onto the range centred on zero.
      x     = where(x < Integer(extent/2), x, x - Integer(extent));
      spNrm = spNrm + x*x;
    }
  }
  // Builds one lattice per direction holding
  //   khat_mu = exp(i*pi*n_mu/L_mu) * 2 * sin(pi*n_mu/L_mu)
  // where n_mu is the site coordinate and L_mu the full extent in direction mu.
  // khat is resized to Nd entries on this object's grid.
  template<class GImpl>
  void Photon<GImpl>::makeKHat(std::vector<GaugeLinkField> &khat)
  {
    const unsigned int nDim = grid_->Nd();
    std::vector<int>   dims = grid_->FullDimensions();
    Complex            imagUnit(0., 1.);

    khat.resize(nDim, grid_);
    for (unsigned int dir = 0; dir < nDim; ++dir)
    {
      GaugeLinkField &k         = khat[dir];
      Real           piOverL   = M_PI/dims[dir];

      // k first holds the integer coordinate, then is overwritten with khat.
      LatticeCoordinate(k, dir);
      k = exp(piOverL*imagUnit*k)*2.*sin(piOverL*k);
    }
  }
  // Computes out = 1 / sum_mu |khat_mu|^2 on every site, with the origin
  // (all coordinates zero) set to 0 instead of dividing by zero.
  template<class GImpl>
  void Photon<GImpl>::makeInvKHatSquared(GaugeLinkField &out)
  {
    const unsigned int          nDim = grid_->Nd();
    std::vector<int>            origin(nDim, 0);
    std::vector<GaugeLinkField> kHat;
    GaugeLinkField              ones(grid_);
    ScalarSite                  unitSite = ScalarComplex(1., 0.);
    ScalarSite                  zeroSite = ScalarComplex(0., 0.);

    makeKHat(kHat);
    out = zero;
    for (unsigned int dir = 0; dir < nDim; ++dir)
    {
      out = out + kHat[dir]*conjugate(kHat[dir]);
    }
    ones = ScalarComplex(1., 0.);
    // Put a harmless 1 at the origin so the division below is well defined there...
    pokeSite(unitSite, out, origin);
    out = ones/out;
    // ...then overwrite the origin with 0.
    pokeSite(zeroSite, out, origin);
  }
  // Zero-mode subtraction on `out`, according to zmScheme_:
  //   qedTL : the single site at the origin is set to 0.
  //   qedL  : every site whose spatial norm (see makeSpatialNorm) is 0 is set to 0;
  //           then, for each improvement coefficient c_i, sites with spatial norm
  //           i+1 are scaled by sqrt(c_i + 1).
  // Any other scheme value trips assert(0).
  template<class GImpl>
  void Photon<GImpl>::zmSub(GaugeLinkField &out)
  {
    switch (zmScheme_)
    {
      case ZmScheme::qedTL:
      {
        ScalarSite       zeroSite = ScalarComplex(0., 0.);
        std::vector<int> origin(grid_->Nd(), 0);

        pokeSite(zeroSite, out, origin);
        break;
      }
      case ZmScheme::qedL:
      {
        LatticeInteger shell(grid_);

        makeSpatialNorm(shell);
        out = where(shell == Integer(0), 0.*out, out);
        for (unsigned int idx = 0; idx < improvement_.size(); ++idx)
        {
          Real scale = sqrt(improvement_[idx] + 1);

          out = where(shell == Integer(idx + 1), scale*out, out);
        }
        break;
      }
      default:
        assert(0);
        break;
    }
  }
  // Removes the spatially-longitudinal part of a momentum-space field in place:
  //   div     = (sum over spatial nu of conj(khat_nu) * A_nu) / (sum over spatial nu of |khat_nu|^2)
  //   A_mu   <- A_mu - khat_mu * div        for every mu
  // "Spatial" means all directions except the last. On sites with zero spatial
  // norm the inverse denominator is forced to 0, so those sites are left unchanged.
  template<class GImpl>
  void Photon<GImpl>::transverseProjectSpatial(GaugeField &out)
  {
    const unsigned int          nDim     = grid_->Nd();
    const unsigned int          nSpatial = nDim - 1;
    std::vector<GaugeLinkField> kHat, comp(nDim, grid_), projected(nDim, grid_);
    GaugeLinkField              invKSq(grid_), fill(grid_), div(grid_);
    LatticeInteger              spNrm(grid_);

    makeSpatialNorm(spNrm);
    makeKHat(kHat);

    // Split the field into its Lorentz components.
    for (unsigned int mu = 0; mu < nDim; ++mu)
    {
      comp[mu] = peekLorentz(out, mu);
    }

    // Spatial |khat|^2, inverted with the zero-norm sites masked out.
    invKSq = zero;
    for (unsigned int mu = 0; mu < nSpatial; ++mu)
    {
      invKSq += kHat[mu]*conjugate(kHat[mu]);
    }
    fill   = ScalarComplex(1., 0.);
    invKSq = where(spNrm == Integer(0), fill, invKSq);
    invKSq = fill/invKSq;
    fill   = zero;
    invKSq = where(spNrm == Integer(0), fill, invKSq);

    // Spatial divergence divided by spatial |khat|^2.
    div = zero;
    for (unsigned int nu = 0; nu < nSpatial; ++nu)
    {
      div += conjugate(kHat[nu])*comp[nu];
    }
    div *= invKSq;

    // Subtract the longitudinal part from every component and write it back.
    for (unsigned int mu = 0; mu < nDim; ++mu)
    {
      projected[mu] = comp[mu] - kHat[mu]*div;
      pokeLorentz(out, projected[mu], mu);
    }
  }
  // Brings a momentum-space field into the gauge selected at construction:
  //   feynman : nothing to do
  //   coulomb : spatial transverse projection
  //   landau and anything else : not handled, trips assert(0)
  template<class GImpl>
  void Photon<GImpl>::gaugeTransform(GaugeField &out)
  {
    switch (gauge_)
    {
      case Gauge::feynman:
        break;
      case Gauge::coulomb:
        transverseProjectSpatial(out);
        break;
      case Gauge::landau:
      default:
        assert(0);
        break;
    }
  }
  // Multiplies a momentum-space field by the photon propagator:
  //   out = in * momProp,   momProp = zero-mode-subtracted 1/khat^2.
  // The scratch field is declared as GaugeLinkField, the type makeInvKHatSquared
  // and zmSub take by non-const reference, so the function no longer depends on
  // GaugeLinkField happening to be the same type as LatticeComplex.
  template<class GImpl>
  void Photon<GImpl>::MomentumSpacePropagator(const GaugeField &in,
                                              GaugeField &out)
  {
    GaugeLinkField momProp(grid_);
    makeInvKHatSquared(momProp);
    zmSub(momProp);
    out = in*momProp;
  }
  // Computes the momentum-space weight used to shape Gaussian noise:
  //   weight = sqrt(V) * sqrt(1/khat^2), followed by zero-mode subtraction,
  // where V is the product of the full lattice extents.
  template<class GImpl>
  void Photon<GImpl>::StochasticWeight(GaugeLinkField &weight)
  {
    const unsigned int nDim   = grid_->Nd();
    std::vector<int>   dims   = grid_->FullDimensions();
    Integer            volume = 1;

    for (unsigned int dir = 0; dir < nDim; ++dir)
    {
      volume = volume*dims[dir];
    }
    makeInvKHatSquared(weight);
    weight = sqrt(volume)*sqrt(weight);
    zmSub(weight);
  }
  // Convenience overload: computes the stochastic weight on this object's grid
  // and forwards to the three-argument StochasticField.
  template<class GImpl>
  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
  {
    GaugeLinkField w(grid_);

    StochasticWeight(w);
    StochasticField(out, rng, w);
  }
  // Generates a stochastic photon field with a caller-supplied weight.
  // For every Lorentz component a fresh Gaussian field is drawn and multiplied by
  // `weight`; the momentum-space field is then gauge transformed, Fourier
  // transformed backward over all dimensions, and reduced to its real part.
  template<class GImpl>
  void Photon<GImpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
                                      const GaugeLinkField &weight)
  {
    const unsigned int nDim = grid_->Nd();
    GaugeLinkField     noise(grid_);
    GaugeField         momField(grid_);
    FFT                fft(dynamic_cast<GridCartesian *>(grid_));

    for (unsigned int dir = 0; dir < nDim; ++dir)
    {
      gaussian(rng, noise);
      noise = weight*noise;
      pokeLorentz(momField, noise, dir);
    }
    gaugeTransform(momField);
    fft.FFT_all_dim(out, momField, FFT::backward);
    out = real(out);
  }
  // Sets every Lorentz component of `out` to the constant 1 and then takes the
  // real part of the result.
  template<class GImpl>
  void Photon<GImpl>::UnitField(GaugeField &out)
  {
    const unsigned int nDim = grid_->Nd();
    GaugeLinkField     unit(grid_);

    unit = ScalarComplex(1., 0.);
    for (unsigned int dir = 0; dir < nDim; ++dir)
    {
      pokeLorentz(out, unit, dir);
    }
    out = real(out);
  }
 }}
 #endif
--- a/Grid/qcd/action/pseudofermion/Bounds.h
+++ b/Grid/qcd/action/pseudofermion/Bounds.h
@@ -1,53 +0,0 @@
 #pragma once
 namespace Grid{
  namespace QCD{
    // Checks the upper spectral bound of HermOp.
    // A PowerMethod estimate of the largest eigenvalue is computed starting from Phi,
    // logged next to the expected bound `hi`, and asserted to be strictly below it.
    template<class Field>
    void HighBoundCheck(LinearOperatorBase<Field> &HermOp,
                        Field &Phi,
                        RealD hi)
    {
      PowerMethod<Field> estimator;
      auto lambda_max = estimator(HermOp,Phi);

      std::cout << GridLogMessage << "Pseudofermion action lamda_max "<<lambda_max<<"( bound "<<hi<<")"<<std::endl;
      assert( (lambda_max < hi) && " High Bounds Check on operator failed" );
    }
    template<class Field> void InverseSqrtBoundsCheck(int MaxIter,double tol,
 						       LinearOperatorBase<Field> &HermOp,
 						       Field &GaussNoise,
 						       MultiShiftFunction &PowerNegHalf) 
    {
      GridBase *FermionGrid = GaussNoise._grid;
      Field X(FermionGrid);
      Field Y(FermionGrid);
      Field Z(FermionGrid);
      X=GaussNoise;
      RealD Nx = norm2(X);
      ConjugateGradientMultiShift<Field> msCG(MaxIter,PowerNegHalf);
      msCG(HermOp,X,Y);
      msCG(HermOp,Y,Z);
      RealD Nz = norm2(Z);
      HermOp.HermOp(Z,Y);
      RealD Ny = norm2(Y);
      X=X-Y;
      RealD Nd = norm2(X);
      std::cout << "************************* "<<std::endl;
      std::cout << " noise                         = "<<Nx<<std::endl;
      std::cout << " (MdagM^-1/2)^2  noise         = "<<Nz<<std::endl;
      std::cout << " MdagM (MdagM^-1/2)^2  noise   = "<<Ny<<std::endl;
      std::cout << " noise - MdagM (MdagM^-1/2)^2  noise   = "<<Nd<<std::endl;
      std::cout << "************************* "<<std::endl;
      assert( (std::sqrt(Nd/Nx)<tol) && " InverseSqrtBoundsCheck ");
    }
  }
 }
--- a/Grid/qcd/action/scalar/ScalarInteractionAction.h
+++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h
@@ -1,208 +0,0 @@
 /*************************************************************************************
  Grid physics library, www.github.com/paboyle/Grid
  Source file: ./lib/qcd/action/scalar/ScalarInteractionAction.h
  Copyright (C) 2015
  Author: Guido Cossu <guido.cossu@ed.ac.uk>
  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  See the full license in the file "LICENSE" in the top level distribution
 directory
  *************************************************************************************/
 /*  END LEGAL */
 #ifndef SCALAR_INT_ACTION_H
 #define SCALAR_INT_ACTION_H
 // Note: this action can completely absorb the ScalarAction for real float fields
 // use the scalarObjs to generalise the structure
 namespace Grid
 {
 // FIXME drop the QCD namespace everywhere here
 // Scalar action with a quartic self-interaction on an Ndim-dimensional lattice.
 // Parameters: mass_square, lambda (quartic coupling) and g (overall 1/g factor).
 // Nearest neighbours are fetched through a 2*Ndim-point CartesianStencil:
 // points [0,Ndim) are displacement +1 in direction mu, points [Ndim,2*Ndim)
 // are displacement -1.
 template <class Impl, int Ndim>
 class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
 {
 public:
  INHERIT_FIELD_TYPES(Impl);
 private:
  RealD mass_square;
  RealD lambda;
  RealD g;
  const unsigned int N = Impl::Group::Dimension;
  typedef typename Field::vector_object vobj;
  typedef CartesianStencil<vobj, vobj> Stencil;
  SimpleCompressor<vobj> compressor;
  int npoint = 2 * Ndim;
  std::vector<int> directions;    // direction index of each stencil point
  std::vector<int> displacements; // displacement (+1 or -1) of each stencil point
 public:
  // The initialiser list follows the member declaration order (directions before
  // displacements), which is the order the members are actually constructed in.
  ScalarInteractionAction(RealD ms, RealD l, RealD gval)
    : mass_square(ms), lambda(l), g(gval),
      directions(2 * Ndim, 0), displacements(2 * Ndim, 0)
  {
    for (int mu = 0; mu < Ndim; mu++)
    {
      directions[mu] = mu;
      directions[mu + Ndim] = mu;
      displacements[mu] = 1;
      displacements[mu + Ndim] = -1;
    }
  }
  // Returns the three couplings formatted one per line for the log.
  virtual std::string LogParameters()
  {
    std::stringstream sstream;
    sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda << std::endl;
    sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
    sstream << GridLogMessage << "[ScalarAction] g           : " << g << std::endl;
    return sstream.str();
  }
  virtual std::string action_name() { return "ScalarAction"; }
  // Nothing to refresh for this action.
  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
  // Evaluates the action on field p:
  //   action = (2*Ndim + mass_square) p^2 - lambda p^4 - sum_mu (p(x+mu) p(x) + p(x) p(x+mu))
  // and returns -Re(sum trace(action)) * N / g.
  virtual RealD S(const Field &p)
  {
    assert(p._grid->Nd() == Ndim);
    // NOTE(review): function-local static, so the stencil is built once from the grid
    // of the first call and reused afterwards -- later calls on a different grid would
    // still use that first stencil. Confirm this is intended.
    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
    phiStencil.HaloExchange(p, compressor);
    Field action(p._grid), pshift(p._grid), phisquared(p._grid);
    phisquared = p * p;
    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
    // Only the Ndim forward (+1) stencil points are visited here.
    for (int mu = 0; mu < Ndim; mu++)
    {
      //  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
      parallel_for(int i = 0; i < p._grid->oSites(); i++)
      {
        int permute_type;
        StencilEntry *SE;
        vobj temp2;
        const vobj *temp, *t_p;
        SE = phiStencil.GetEntry(permute_type, mu, i);
        t_p = &p._odata[i];
        if (SE->_is_local)
        {
          // Neighbour lives in this rank's data, possibly needing a SIMD permute.
          temp = &p._odata[SE->_offset];
          if (SE->_permute)
          {
            permute(temp2, *temp, permute_type);
            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
          }
          else
          {
            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
          }
        }
        else
        {
          // Neighbour was received into the stencil's communication buffer.
          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
        }
      }
      //  action -= pshift*p + p*pshift;
    }
    // NB the trace in the algebra is normalised to 1/2
    // minus sign coming from the antihermitian fields
    return -(TensorRemove(sum(trace(action)))).real() * N / g;
  };
  // Computes the force:
  //   force = ((2*Ndim + mass_square) p - 2 lambda p^3 - sum over all 2*Ndim neighbours) * N / g
  // and logs timing and flop-rate figures through GridLogIntegrator.
  virtual void deriv(const Field &p, Field &force)
  {
    double t0 = usecond();
    assert(p._grid->Nd() == Ndim);
    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
    double interm_t = usecond();
    // move this outside
    // NOTE(review): same function-local static stencil caveat as in S().
    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
    phiStencil.HaloExchange(p, compressor);
    double halo_t = usecond();
    int chunk = 128;
    //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
    // inverting the order of the loops slows down the code(! g++ 7)
    // cannot try to reduce the number of  force writes by factor npoint...
    // use cache blocking
    for (int point = 0; point < npoint; point++)
    {
 #pragma omp parallel 
 {
        // Declared inside the parallel region, so each thread has its own copies.
        int permute_type;
        StencilEntry *SE;
        const vobj *temp;
 #pragma omp for schedule(static, chunk)
      for (int i = 0; i < p._grid->oSites(); i++)
      {
        SE = phiStencil.GetEntry(permute_type, point, i);
        // prefetch next p?
        if (SE->_is_local)
        {
          temp = &p._odata[SE->_offset];
          if (SE->_permute)
          {
            vobj temp2;
            permute(temp2, *temp, permute_type);
            force._odata[i] -= temp2;
          }
          else
          {
            force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
          }
        }
        else
        {
          force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
        }
      }
    }
  }
  force *= N / g;
  // Timing breakdown (seconds): local polynomial term, halo exchange, stencil loop.
  double t1 = usecond();
  double total_time = (t1 - t0) / 1e6;
  double interm_time = (interm_t - t0) / 1e6;
  double halo_time = (halo_t - interm_t) / 1e6;
  double stencil_time = (t1 - halo_t) / 1e6;
  std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
  std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
  std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
  std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
  double flops = p._grid->gSites() * (14 * N * N * N + 18 * N * N + 2);
  double flops_no_stencil = p._grid->gSites() * (14 * N * N * N + 6 * N * N + 2);
  double Gflops = flops / (total_time * 1e9);
  double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
  std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
  std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
 }
 };
 } // namespace Grid
 #endif // SCALAR_INT_ACTION_H
--- a/Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h
@@ -1,122 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/hmc/ScidacCheckpointer.h
 Copyright (C) 2018
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef SCIDAC_CHECKPOINTER
 #define SCIDAC_CHECKPOINTER
 #ifdef HAVE_LIME
 #include <iostream>
 #include <sstream>
 #include <string>
 namespace Grid {
 namespace QCD {
 // For generic fields
 template <class Implementation, class Metadata>
 class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
 private:
  using Field = typename Implementation::Field;

  CheckpointerParameters Params;  // settings read here: format, saveInterval
  Metadata MData;                 // metadata attached to every written field record

 public:
  // Both constructors validate the parameters through initialize().
  ScidacHmcCheckpointer(const CheckpointerParameters &Params_) {
    initialize(Params_);
  }
  ScidacHmcCheckpointer(const CheckpointerParameters &Params_, const Metadata &M_)
      : MData(M_) {
    initialize(Params_);
  }

  // Stores the parameters, then terminates the program with exit(1) unless
  // the requested format is one of the four accepted names.
  void initialize(const CheckpointerParameters &Params_) {
    Params = Params_;

    const char *const acceptedFormats[] = {"IEEE32BIG", "IEEE32",
                                           "IEEE64BIG", "IEEE64"};
    bool recognised = false;
    for (const char *candidate : acceptedFormats) {
      if (Params.format == std::string(candidate)) {
        recognised = true;
        break;
      }
    }
    if (recognised) return;

    std::cout << GridLogError << "Unrecognized file format " << Params.format
              << std::endl;
    std::cout << GridLogError
              << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64"
              << std::endl;
    exit(1);
  }

  // Every Params.saveInterval trajectories: writes the RNG state and then the
  // field U (together with MData) to the files named by build_filenames().
  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
                          GridParallelRNG &pRNG) {
    if ((traj % Params.saveInterval) != 0) return;

    std::string configFile;
    std::string rngFile;
    this->build_filenames(traj, Params, configFile, rngFile);

    GridBase *fieldGrid = U._grid;
    uint32_t nerscSum;
    uint32_t scidacSumA;
    uint32_t scidacSumB;
    BinaryIO::writeRNG(sRNG, pRNG, rngFile, 0, nerscSum, scidacSumA, scidacSumB);

    ScidacWriter writer(fieldGrid->IsBoss());
    writer.open(configFile);
    writer.writeScidacFieldRecord(U, MData);
    writer.close();

    std::cout << GridLogMessage << "Written Scidac Configuration on "
              << configFile << std::endl;
  }

  // Restores the RNG state and the field U for trajectory traj; both file
  // names are passed through check_filename() before anything is read.
  void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
                         GridParallelRNG &pRNG) {
    std::string configFile;
    std::string rngFile;
    this->build_filenames(traj, Params, configFile, rngFile);
    this->check_filename(rngFile);
    this->check_filename(configFile);

    uint32_t nerscSum;
    uint32_t scidacSumA;
    uint32_t scidacSumB;
    BinaryIO::readRNG(sRNG, pRNG, rngFile, 0, nerscSum, scidacSumA, scidacSumB);

    Metadata recordMetadata;
    ScidacReader reader;
    reader.open(configFile);
    reader.readScidacFieldRecord(U, recordMetadata);  // format from the header
    reader.close();

    // The checksums reported are the ones produced by the RNG read above.
    std::cout << GridLogMessage << "Read Scidac Configuration from " << configFile
              << " checksum " << std::hex
              << nerscSum << "/"
              << scidacSumA << "/"
              << scidacSumB
              << std::dec << std::endl;
  }
 };
 }
 }
 #endif  // HAVE_LIME
 #endif  // SCIDAC_CHECKPOINTER
--- a/Grid/qcd/observables/polyakov_loop.h
+++ b/Grid/qcd/observables/polyakov_loop.h
@@ -1,68 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/modules/polyakov_line.h
 Copyright (C) 2017
 Author: David Preti <david.preti@csic.es>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef HMC_POLYAKOV_H
 #define HMC_POLYAKOV_H
 namespace Grid {
 namespace QCD {
 // this is only defined for a gauge theory
 template <class Impl>
 class PolyakovLogger : public HmcObservable<typename Impl::Field> {
 public:
  // Importing the gauge implementation types restricts Impl to gauge
  // fields: any other implementation fails to compile at this point.
  INHERIT_GIMPL_TYPES(Impl);

  // Field alias needed for compatibility with HmcObservable.
  using Field = typename Impl::Field;

  // Logs the average Polyakov loop of U for trajectory traj. The RNG
  // arguments are part of the observable interface and are not used here.
  void TrajectoryComplete(int traj,
                          Field &U,
                          GridSerialRNG &sRNG,
                          GridParallelRNG &pRNG) {
    const ComplexD loopValue = WilsonLoops<Impl>::avgPolyakovLoop(U);

    // Print with enough digits for Real, then restore std::cout's precision.
    const int savedPrecision = std::cout.precision();
    const int printDigits = std::numeric_limits<Real>::digits10 + 1;

    std::cout << GridLogMessage
              << std::setprecision(printDigits)
              << "Polyakov Loop: [ " << traj << " ] " << loopValue << std::endl;

    std::cout.precision(savedPrecision);
  }
 };
 }  // namespace QCD
 }  // namespace Grid
 #endif  // HMC_POLYAKOV_H
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
--- a/Grid/qcd/utils/CovariantSmearing.h
+++ b/Grid/qcd/utils/CovariantSmearing.h
@@ -1,87 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/scalar/CovariantLaplacian.h
 Copyright (C) 2016
 Author: Azusa Yamaguchi
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 #pragma once
 namespace Grid {
 namespace QCD {
 template <class Gimpl> class CovariantSmearing : public Gimpl 
 {
 public:
  INHERIT_GIMPL_TYPES(Gimpl);
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  typedef typename Gimpl::GaugeField GaugeLorentz;

  // Gaussian smearing of chi, performed in place by repeated application of
  // the gauge-covariant lattice Laplacian.
  //
  //   U          : link fields, indexed by direction (U[mu] is used for mu in [0,Nd))
  //   chi        : field to smear; overwritten with the smeared result
  //   width      : smearing width w (Chroma convention, see below)
  //   Iterations : number N of smearing steps
  //   orthog     : direction excluded from the Laplacian; any value outside
  //                [0,Nd) excludes nothing, giving the full Nd-dimensional operator
  template<typename T>
  static void GaussianSmear(const std::vector<LatticeColourMatrix>& U, 
			    T& chi, 
			    const Real& width, int Iterations, int orthog)
  {
    GridBase *grid = chi._grid;
    T psi(grid);

    ////////////////////////////////////////////////////////////////////////////////////
    // Follow Chroma conventions for width to keep compatibility with previous data
    // Free field iterates 
    //   chi = (1 - w^2/4N p^2)^N chi
    //
    //       ~ (e^(-w^2/4N p^2))^N chi
    //       ~ e^(-w^2/4 p^2) chi
    //       ~ e^(-w'^2/2 p^2) chi          [ w' = w/sqrt(2) ]
    //
    // Which in coordinate space is proportional to
    //
    //   e^(-x^2/w^2) = e^(-x^2/2w'^2) 
    //
    // The 4 is a bit unconventional from Gaussian width perspective, but... it's Chroma convention.
    // 2nd derivative approx d^2/dx^2 f(x) = f(x+mu) + f(x-mu) - 2 f(x)
    //
    // d^2/dx^2 = - p^2
    //
    // chi = ( 1 + w^2/4N d^2/dx^2 )^N chi
    //
    ////////////////////////////////////////////////////////////////////////////////////
    Real coeff = (width*width) / Real(4*Iterations);

    // Number of directions entering the Laplacian. A direction is only dropped
    // when orthog names a real one: the loop below skips mu == orthog, which
    // never happens for negative orthog, so the diagonal term -2*dims must
    // count all Nd directions in that case as well.
    int dims = Nd;
    if( orthog >= 0 && orthog < Nd ) dims=Nd-1;

    for(int n = 0; n < Iterations; ++n) {
      // Diagonal part of the Laplacian: -2 chi per included direction.
      psi = (-2.0*dims)*chi;
      // Hopping part: covariantly shifted neighbours in each included direction.
      for(int mu=0;mu<Nd;mu++) {
	if ( mu != orthog ) { 
	  psi = psi + Gimpl::CovShiftForward(U[mu],mu,chi);    
	  psi = psi + Gimpl::CovShiftBackward(U[mu],mu,chi);    
	}
      }
      chi = chi + coeff*psi;
    }
  }
 };
 }}
--- a/Grid/serialisation/BaseIO.h
+++ b/Grid/serialisation/BaseIO.h
@@ -1,660 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/serialisation/BaseIO.h
    Copyright (C) 2015
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_SERIALISATION_ABSTRACT_READER_H
 #define GRID_SERIALISATION_ABSTRACT_READER_H
 #include <type_traits>
 #include <Grid/tensors/Tensors.h>
 #include <Grid/serialisation/VectorUtils.h>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
 namespace Grid {
  // Compile-time traits used by the serialisation layer to classify Eigen
  // tensors and the element types they hold.
  namespace EigenIO {
    // EigenIO works for scalars that are not just Grid supported scalars
    // Primary template: a type is not complex unless a specialisation below matches.
    template<typename T, typename V = void> struct is_complex : public std::false_type {};
    // Support all complex types (not just Grid complex types) - even if the definitions overlap (!)
    // First specialisation: anything ::Grid::is_complex already accepts.
    template<typename T> struct is_complex<             T , typename
        std::enable_if< ::Grid::is_complex<             T >::value>::type> : public std::true_type {};
    // Second specialisation: any std::complex<T> that ::Grid::is_complex rejects
    // (the negated condition keeps the two specialisations from both matching).
    template<typename T> struct is_complex<std::complex<T>, typename
        std::enable_if<!::Grid::is_complex<std::complex<T>>::value>::type> : public std::true_type {};
    // Helpers to support I/O for Eigen tensors of arithmetic scalars, complex types, or Grid tensors
    // is_scalar: true for arithmetic types and for types accepted by is_complex above.
    template<typename T, typename V = void> struct is_scalar : public std::false_type {};
    template<typename T> struct is_scalar<T, typename std::enable_if<std::is_arithmetic<T>::value || is_complex<T>::value>::type> : public std::true_type {};
    // Is this an Eigen tensor
    // (true when T derives from Eigen::TensorBase<T, Eigen::ReadOnlyAccessors>)
    template<typename T> struct is_tensor : std::integral_constant<bool,
      std::is_base_of<Eigen::TensorBase<T, Eigen::ReadOnlyAccessors>, T>::value> {};
    // Is this an Eigen tensor of a supported scalar
    template<typename T, typename V = void> struct is_tensor_of_scalar : public std::false_type {};
    template<typename T> struct is_tensor_of_scalar<T, typename std::enable_if<is_tensor<T>::value && is_scalar<typename T::Scalar>::value>::type> : public std::true_type {};
    // Is this an Eigen tensor of a supported container
    // (a container here is an element type for which isGridTensor is true)
    template<typename T, typename V = void> struct is_tensor_of_container : public std::false_type {};
    template<typename T> struct is_tensor_of_container<T, typename std::enable_if<is_tensor<T>::value && isGridTensor<typename T::Scalar>::value>::type> : public std::true_type {};
    // These traits describe the scalars inside Eigen tensors
    // I wish I could define these in reference to the scalar type (so there would be fewer traits defined)
    // but I'm unable to find a syntax to make this work
    // Primary template is empty: Traits is only usable for the two tensor kinds below.
    template<typename T, typename V = void> struct Traits {};
    // Tensor of plain scalars: the defaults are inherited from GridTypeMapper_Base
    // and the scalar type is the tensor's own element type.
    template<typename T> struct Traits<T, typename std::enable_if<is_tensor_of_scalar<T>::value>::type>
      : public GridTypeMapper_Base {
      using scalar_type   = typename T::Scalar; // ultimate base scalar
      static constexpr bool is_complex = ::Grid::EigenIO::is_complex<scalar_type>::value;
    };
    // Tensor of Grid tensors: every trait is forwarded from the GridTypeMapper
    // of the element (container) type.
    template<typename T> struct Traits<T, typename std::enable_if<is_tensor_of_container<T>::value>::type> {
      using BaseTraits  = GridTypeMapper<typename T::Scalar>;
      using scalar_type = typename BaseTraits::scalar_type; // ultimate base scalar
      static constexpr bool   is_complex = ::Grid::EigenIO::is_complex<scalar_type>::value;
      static constexpr int   TensorLevel = BaseTraits::TensorLevel;
      static constexpr int          Rank = BaseTraits::Rank;
      static constexpr std::size_t count = BaseTraits::count;
      static constexpr int Dimension(int dim) { return BaseTraits::Dimension(dim); }
    };
    // Is this a fixed-size Eigen tensor
    // (matches Eigen::TensorFixedSize and an Eigen::TensorMap onto one)
    template<typename T> struct is_tensor_fixed : public std::false_type {};
    template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
    struct is_tensor_fixed<Eigen::TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType>>
        : public std::true_type {};
    template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType,
              int MapOptions_, template <class> class MapPointer_>
    struct is_tensor_fixed<Eigen::TensorMap<Eigen::TensorFixedSize<Scalar_, Dimensions_,
                                            Options_, IndexType>, MapOptions_, MapPointer_>>
        : public std::true_type {};
    // Is this a variable-size Eigen tensor
    // (any Eigen tensor that is_tensor_fixed does not match)
    template<typename T, typename V = void> struct is_tensor_variable : public std::false_type {};
    template<typename T> struct is_tensor_variable<T, typename std::enable_if<is_tensor<T>::value
        && !is_tensor_fixed<T>::value>::type> : public std::true_type {};
  }
  // Abstract writer/reader classes ////////////////////////////////////////////
  // static polymorphism implemented using CRTP idiom
  class Serializable;
  // Static abstract writer
  // CRTP base for writers: T is the concrete writer type, and the members
  // declared here forward to T through the stored `upcast` pointer.
  template <typename T>
  class Writer
  {
  public:
    Writer(void);
    virtual ~Writer(void) = default;
    // push/pop forward to the concrete writer's push/pop.
    void push(const std::string &s);
    void pop(void);
    // write() overload set, selected on the type of `output`:
    //  - types derived from Serializable
    template <typename U>
    typename std::enable_if<std::is_base_of<Serializable, U>::value>::type
    write(const std::string& s, const U &output);
    //  - everything else that is neither Serializable nor an Eigen tensor
    template <typename U>
    typename std::enable_if<!std::is_base_of<Serializable, U>::value && !EigenIO::is_tensor<U>::value>::type
    write(const std::string& s, const U &output);
    //  - Grid tensors (iScalar / iVector / iMatrix)
    template <typename U>
    void write(const std::string &s, const iScalar<U> &output);
    template <typename U, int N>
    void write(const std::string &s, const iVector<U, N> &output);
    template <typename U, int N>
    void write(const std::string &s, const iMatrix<U, N> &output);
    //  - Eigen tensors
    template <typename ETensor>
    typename std::enable_if<EigenIO::is_tensor<ETensor>::value>::type
    write(const std::string &s, const ETensor &output);
    // Helper functions for Scalar vs Container specialisations
    // getFirstScalar: pointer to the first underlying scalar of the tensor's data.
    // For a tensor of scalars this is simply data() ...
    template <typename ETensor>
    inline typename std::enable_if<EigenIO::is_tensor_of_scalar<ETensor>::value,
    const typename ETensor::Scalar *>::type
    getFirstScalar(const ETensor &output)
    {
      return output.data();
    }
    // ... and for a tensor of containers it is begin() of the first container.
    template <typename ETensor>
    inline typename std::enable_if<EigenIO::is_tensor_of_container<ETensor>::value,
    const typename EigenIO::Traits<ETensor>::scalar_type *>::type
    getFirstScalar(const ETensor &output)
    {
      return output.data()->begin();
    }
    // copyScalars: appends the scalar(s) of Source at pCopy and advances pCopy
    // past what was written. Scalar case: a single value.
    template <typename S>
    inline typename std::enable_if<EigenIO::is_scalar<S>::value, void>::type
    copyScalars(S * &pCopy, const S &Source)
    {
      * pCopy ++ = Source;
    }
    // Grid tensor case: every scalar yielded by iterating over Source, in order.
    template <typename S>
    inline typename std::enable_if<isGridTensor<S>::value, void>::type
    copyScalars(typename GridTypeMapper<S>::scalar_type * &pCopy, const S &Source)
    {
      for( const typename GridTypeMapper<S>::scalar_type &item : Source )
        * pCopy ++ = item;
    }
    // Accessors for the formatting settings stored below.
    void         scientificFormat(const bool set);
    bool         isScientific(void);
    void         setPrecision(const unsigned int prec);
    unsigned int getPrecision(void);
  private:
    T            *upcast;             // this object viewed as the concrete writer T
    bool         scientific_{false};  // value set by scientificFormat()
    unsigned int prec_{0};            // value set by setPrecision()
  };
  // Static abstract reader
  // CRTP base for readers: T is the concrete reader type, and the members
  // declared here forward to T through the stored `upcast` pointer.
  template <typename T>
  class Reader
  {
  public:
    Reader(void);
    virtual ~Reader(void) = default;
    // push/pop forward to the concrete reader; push returns the concrete
    // reader's result.
    bool push(const std::string &s);
    void pop(void);
    // read() overload set, selected on the type of `output`:
    //  - types derived from Serializable
    template <typename U>
    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
    read(const std::string& s, U &output);
    //  - everything else that is neither Serializable nor an Eigen tensor
    template <typename U>
    typename std::enable_if<!std::is_base_of<Serializable, U>::value
                         && !EigenIO::is_tensor<U>::value, void>::type
    read(const std::string& s, U &output);
    //  - Grid tensors (iScalar / iVector / iMatrix)
    template <typename U>
    void read(const std::string &s, iScalar<U> &output);
    template <typename U, int N>
    void read(const std::string &s, iVector<U, N> &output);
    template <typename U, int N>
    void read(const std::string &s, iMatrix<U, N> &output);
    //  - Eigen tensors
    template <typename ETensor>
    typename std::enable_if<EigenIO::is_tensor<ETensor>::value, void>::type
    read(const std::string &s, ETensor &output);
    // Reshape: give tensor t the dimensions `dims`. There is one overload for
    // fixed-size tensors and one for variable-size tensors.
    template <typename ETensor>
    typename std::enable_if<EigenIO::is_tensor_fixed<ETensor>::value, void>::type
    Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims );
    template <typename ETensor>
    typename std::enable_if<EigenIO::is_tensor_variable<ETensor>::value, void>::type
    Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims );
    // Helper functions for Scalar vs Container specialisations
    // copyScalars: fills Dest from the scalars at pSource and advances pSource
    // past what was consumed. Scalar case: a single value.
    template <typename S>
    inline typename std::enable_if<EigenIO::is_scalar<S>::value, void>::type
    copyScalars(S &Dest, const S * &pSource)
    {
      Dest = * pSource ++;
    }
    // Grid tensor case: every scalar visited by iterating over Dest, in order.
    template <typename S>
    inline typename std::enable_if<isGridTensor<S>::value, void>::type
    copyScalars(S &Dest, const typename GridTypeMapper<S>::scalar_type * &pSource)
    {
      for( typename GridTypeMapper<S>::scalar_type &item : Dest )
        item = * pSource ++;
    }
  protected:
    // Parses s into output via stream extraction (defined below).
    template <typename U>
    void fromString(U &output, const std::string &s);
  private:
    T *upcast; // this object viewed as the concrete reader T
  };
   // Traits flagging reader / writer types; the primary templates report false
  // Trait answering "is T a reader type?"; the primary template says no.
  template<typename T>
  struct isReader
  {
    static constexpr bool value = false;
  };
  // Trait answering "is T a writer type?"; the primary template says no.
  template<typename T>
  struct isWriter
  {
    static constexpr bool value = false;
  };
  // Writer template implementation
  // Records this object viewed as the concrete writer T (CRTP downcast), so
  // that the forwarding members can reach the concrete implementation.
  template <typename T>
  Writer<T>::Writer(void)
    : upcast(static_cast<T *>(this))
  {
  }
  // Forwards push(s) to the concrete writer.
  template <typename T>
  void Writer<T>::push(const std::string &s)
  {
    upcast->push(s);
  }
  // Forwards pop() to the concrete writer.
  template <typename T>
  void Writer<T>::pop(void)
  {
    upcast->pop();
  }
  // Serializable types: delegate to the static U::write, handing it this
  // writer, the name s and the object.
  template <typename T>
  template <typename U>
  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
  Writer<T>::write(const std::string &s, const U &output)
  {
    U::write(*this, s, output);
  }
  // Types that are neither Serializable nor Eigen tensors: hand the value
  // to the concrete writer's writeDefault.
  template <typename T>
  template <typename U>
  typename std::enable_if<!std::is_base_of<Serializable, U>::value
                       && !EigenIO::is_tensor<U>::value, void>::type
  Writer<T>::write(const std::string &s, const U &output)
  {
    upcast->writeDefault(s, output);
  }
  // iScalar: convert with tensorToVec and write the result via writeDefault.
  template <typename T>
  template <typename U>
  void Writer<T>::write(const std::string &s, const iScalar<U> &output)
  {
    upcast->writeDefault(s, tensorToVec(output));
  }
  // iVector: convert with tensorToVec and write the result via writeDefault.
  template <typename T>
  template <typename U, int N>
  void Writer<T>::write(const std::string &s, const iVector<U, N> &output)
  {
    upcast->writeDefault(s, tensorToVec(output));
  }
  // iMatrix: convert with tensorToVec and write the result via writeDefault.
  template <typename T>
  template <typename U, int N>
  void Writer<T>::write(const std::string &s, const iMatrix<U, N> &output)
  {
    upcast->writeDefault(s, tensorToVec(output));
  }
  // Eigen::Tensors of Grid tensors (iScalar, iVector, iMatrix)
  // Writes an Eigen tensor under the name s.
  //
  // The tensor is handed to the concrete writer's writeMultiDim as a flat
  // buffer of underlying scalars plus a list of dimensions: the tensor's own
  // dimensions first, followed by the dimensions of the element type when the
  // elements are Grid tensors. The buffer is always in row-major order; a
  // tensor stored in any other layout is copied into a temporary first.
  // Tensors with no elements are rejected by assertion.
  template <typename T>
  template <typename ETensor>
  typename std::enable_if<EigenIO::is_tensor<ETensor>::value, void>::type
  Writer<T>::write(const std::string &s, const ETensor &output)
  {
    using Index = typename ETensor::Index;
    using Container = typename ETensor::Scalar; // NB: could be same as scalar
    using Traits = EigenIO::Traits<ETensor>;
    using Scalar = typename Traits::scalar_type; // type of the underlying scalar
    constexpr unsigned int TensorRank{ETensor::NumIndices};
    constexpr unsigned int ContainerRank{Traits::Rank}; // Only non-zero for containers
    constexpr unsigned int TotalRank{TensorRank + ContainerRank};
    const Index NumElements{output.size()};
    assert( NumElements > 0 );
    // Get the dimensionality of the tensor
    std::vector<std::size_t>  TotalDims(TotalRank);
    // Loop counters are unsigned to match the unsigned rank constants.
    for( unsigned int i = 0; i < TensorRank; i++ ) {
      auto dim = output.dimension(i);
      TotalDims[i] = static_cast<size_t>(dim);
      assert( TotalDims[i] == dim ); // check we didn't lose anything in the conversion
    }
    for( unsigned int i = 0; i < ContainerRank; i++ )
      TotalDims[TensorRank + i] = Traits::Dimension(i);
    // If the Tensor isn't in Row-Major order, then we'll need to copy its data
    const bool CopyData{NumElements > 1 && ETensor::Layout != Eigen::StorageOptions::RowMajor};
    const Scalar * pWriteBuffer;
    std::vector<Scalar> CopyBuffer;
    const Index TotalNumElements = NumElements * Traits::count;
    if( !CopyData ) {
      pWriteBuffer = getFirstScalar( output );
    } else {
      // Regardless of the Eigen::Tensor storage order, the copy will be Row Major
      CopyBuffer.resize( TotalNumElements );
      Scalar * pCopy = CopyBuffer.data();
      pWriteBuffer = pCopy;
      std::array<Index, TensorRank> MyIndex;
      for( auto &idx : MyIndex ) idx = 0;
      // The counter has the tensor's own Index type: a plain int could
      // overflow before reaching NumElements on very large tensors.
      for( Index n = 0; n < NumElements; n++ ) {
        const Container & c = output( MyIndex );
        copyScalars( pCopy, c );
        // Now increment the index (last index fastest, i.e. row-major order)
        for( int i = output.NumDimensions - 1; i >= 0 && ++MyIndex[i] == output.dimension(i); i-- )
          MyIndex[i] = 0;
      }
    }
    upcast->template writeMultiDim<Scalar>(s, TotalDims, pWriteBuffer, TotalNumElements);
  }
  // Stores the scientific-format flag.
  template <typename T>
  void Writer<T>::scientificFormat(const bool set)
  {
    scientific_ = set;
  }
  // Returns the flag last stored by scientificFormat() (false initially).
  template <typename T>
  bool Writer<T>::isScientific(void)
  {
    return scientific_;
  }
  // Stores the requested precision.
  template <typename T>
  void Writer<T>::setPrecision(const unsigned int prec)
  {
    prec_ = prec;
  }
  // Returns the precision last stored by setPrecision() (0 initially).
  template <typename T>
  unsigned int Writer<T>::getPrecision(void)
  {
    return prec_;
  }
  // Reader template implementation
  // Records this object viewed as the concrete reader T (CRTP downcast), so
  // that the forwarding members can reach the concrete implementation.
  template <typename T>
  Reader<T>::Reader(void)
    : upcast(static_cast<T *>(this))
  {
  }
  // Forwards push(s) to the concrete reader and returns its result.
  template <typename T>
  bool Reader<T>::push(const std::string &s)
  {
    return upcast->push(s);
  }
  // Forwards pop() to the concrete reader.
  template <typename T>
  void Reader<T>::pop(void)
  {
    upcast->pop();
  }
  // Serializable types: delegate to the static U::read, handing it this
  // reader, the name s and the destination object.
  template <typename T>
  template <typename U>
  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
  Reader<T>::read(const std::string &s, U &output)
  {
    U::read(*this, s, output);
  }
  // Types that are neither Serializable nor Eigen tensors: let the concrete
  // reader's readDefault fill the value.
  template <typename T>
  template <typename U>
  typename std::enable_if<!std::is_base_of<Serializable, U>::value
                       && !EigenIO::is_tensor<U>::value, void>::type
  Reader<T>::read(const std::string &s, U &output)
  {
    upcast->readDefault(s, output);
  }
  // iScalar: read the TensorToVec representation through readDefault, then
  // convert it back into the tensor with vecToTensor.
  template <typename T>
  template <typename U>
  void Reader<T>::read(const std::string &s, iScalar<U> &output)
  {
    using Flattened = typename TensorToVec<iScalar<U>>::type;

    Flattened flat;
    upcast->readDefault(s, flat);
    vecToTensor(output, flat);
  }
  // iVector: read the TensorToVec representation through readDefault, then
  // convert it back into the tensor with vecToTensor.
  template <typename T>
  template <typename U, int N>
  void Reader<T>::read(const std::string &s, iVector<U, N> &output)
  {
    using Flattened = typename TensorToVec<iVector<U, N>>::type;

    Flattened flat;
    upcast->readDefault(s, flat);
    vecToTensor(output, flat);
  }
  // iMatrix: read the TensorToVec representation through readDefault, then
  // convert it back into the tensor with vecToTensor.
  template <typename T>
  template <typename U, int N>
  void Reader<T>::read(const std::string &s, iMatrix<U, N> &output)
  {
    using Flattened = typename TensorToVec<iMatrix<U, N>>::type;

    Flattened flat;
    upcast->readDefault(s, flat);
    vecToTensor(output, flat);
  }
  // Reads an Eigen tensor stored under the name s.
  //
  // The concrete reader's readMultiDim supplies a flat buffer of underlying
  // scalars plus a list of dimensions (the tensor's dimensions, followed by
  // those of the element type when the elements are Grid tensors). The list is
  // checked against the tensor's rank and element type by assertion. If
  // `output` does not already have the dimensions that were read, Reshape() is
  // called to change them. The buffer is then copied in element by element,
  // walking the tensor's indices with the last index varying fastest.
  template <typename T>
  template <typename ETensor>
  typename std::enable_if<EigenIO::is_tensor<ETensor>::value, void>::type
  Reader<T>::read(const std::string &s, ETensor &output)
  {
    using Index = typename ETensor::Index;
    using Container = typename ETensor::Scalar; // NB: could be same as scalar
    using Traits = EigenIO::Traits<ETensor>;
    using Scalar = typename Traits::scalar_type; // type of the underlying scalar
    constexpr unsigned int TensorRank{ETensor::NumIndices};
    constexpr unsigned int ContainerRank{Traits::Rank}; // Only non-zero for containers
    constexpr unsigned int TotalRank{TensorRank + ContainerRank};
    using ETDims = std::array<Index, TensorRank>; // Dimensions of the tensor
    // read the (flat) data and dimensionality
    std::vector<std::size_t> dimData;
    std::vector<Scalar> buf;
    upcast->readMultiDim( s, buf, dimData );
    assert(dimData.size() == TotalRank && "EigenIO: Tensor rank mismatch" );
    // Make sure that the number of elements read matches dimensions read
    // (loop counters are unsigned to match the unsigned rank constants)
    std::size_t NumContainers = 1;
    for( unsigned int i = 0 ; i < TensorRank ; i++ )
      NumContainers *= dimData[i];
    // If our scalar object is a Container, make sure its dimensions match what we read back
    std::size_t ElementsPerContainer = 1;
    for( unsigned int i = 0 ; i < ContainerRank ; i++ ) {
      assert( dimData[TensorRank+i] == Traits::Dimension(i) && "Tensor Container dimensions don't match data" );
      ElementsPerContainer *= dimData[TensorRank+i];
    }
    assert( NumContainers * ElementsPerContainer == buf.size() && "EigenIO: Number of elements != product of dimensions" );
    // Now see whether the tensor is the right shape, or can be made to be
    const auto & dims = output.dimensions();
    bool bShapeOK = (output.data() != nullptr);
    for( unsigned int i = 0; bShapeOK && i < TensorRank ; i++ )
      if( static_cast<std::size_t>(dims[i]) != dimData[i] )
        bShapeOK = false;
    // Make the tensor the same size as the data read
    ETDims MyIndex;
    if( !bShapeOK ) {
      for( unsigned int i = 0 ; i < TensorRank ; i++ )
        MyIndex[i] = static_cast<Index>(dimData[i]);
      Reshape(output, MyIndex);
    }
    // Copy the data into the tensor
    for( auto &d : MyIndex ) d = 0;
    const Scalar * pSource = buf.data();
    for( std::size_t n = 0 ; n < NumContainers ; n++ ) {
      Container & c = output( MyIndex );
      copyScalars( c, pSource );
      // Now increment the index
      for( int i = static_cast<int>(TensorRank) - 1; i != -1 && ++MyIndex[i] == dims[i]; i-- )
        MyIndex[i] = 0;
    }
    // Every scalar read must have been consumed. The end pointer is formed
    // from data() rather than operator[], because indexing a vector at
    // size() is out of range.
    assert( pSource == buf.data() + NumContainers * ElementsPerContainer );
  }
  // Reshape for fixed-size tensors: the dimensions cannot be changed, so
  // reaching this overload is treated as an error (assertion failure).
  // Neither argument is used.
  template <typename T>
  template <typename ETensor>
  typename std::enable_if<EigenIO::is_tensor_fixed<ETensor>::value, void>::type
  Reader<T>::Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims )
  {
    assert( 0 && "EigenIO: Fixed tensor dimensions can't be changed" );
  }
  // Reshape for variable-size tensors: gives t the dimensions `dims` by
  // calling its resize().
  template <typename T>
  template <typename ETensor>
  typename std::enable_if<EigenIO::is_tensor_variable<ETensor>::value, void>::type
  Reader<T>::Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims )
  {
    //t.reshape( dims );
    t.resize( dims );
  }
  // Parses s into output with stream extraction (bools are read in their
  // alphabetic form). If extraction fails, a diagnostic naming the offending
  // string and the target type is printed to std::cerr and the program aborts.
  template <typename T>
  template <typename U>
  void Reader<T>::fromString(U &output, const std::string &s)
  {
    std::istringstream parser(s);

    // Make a failed extraction throw so it can be reported below.
    parser.exceptions(std::ios::failbit);
    try
    {
      parser >> std::boolalpha >> output;
    }
    catch(std::ios_base::failure &)
    {
      std::cerr << "numerical conversion failure on '" << s << "' "
                << "(typeid: " << typeid(U).name() << ")" << std::endl;
      abort();
    }
  }
  // serializable base class ///////////////////////////////////////////////////
  // Base class for serialisable objects. It supplies do-nothing defaults for
  // write / read / operator<<, plus static helpers for comparing and printing
  // members that give Eigen tensors special treatment.
  class Serializable
  {
  public:
    // Default write hook: does nothing.
    template <typename T>
    static inline void write(Writer<T> &WR,const std::string &s,
                             const Serializable &obj)
    {}
    // Default read hook: does nothing.
    template <typename T>
    static inline void read(Reader<T> &RD,const std::string &s,
                            Serializable &obj)
    {}
    // Default stream output: prints nothing and returns the stream.
    friend inline std::ostream & operator<<(std::ostream &os,
                                            const Serializable &obj)
    {
      return os;
    }
    // CompareMember, general case (at least one operand is not an Eigen
    // tensor): plain operator==.
    template <typename T1, typename T2>
    static inline typename std::enable_if<!EigenIO::is_tensor<T1>::value || !EigenIO::is_tensor<T2>::value, bool>::type
    CompareMember(const T1 &lhs, const T2 &rhs) {
      return lhs == rhs;
    }
    // CompareMember, two Eigen tensors: equal when rank, every dimension and
    // every element agree.
    template <typename T1, typename T2>
    static inline typename std::enable_if<EigenIO::is_tensor<T1>::value && EigenIO::is_tensor<T2>::value, bool>::type
    CompareMember(const T1 &lhs, const T2 &rhs) {
      // First check whether dimensions match (Eigen tensor library will assert if they don't match)
      bool bReturnValue = (T1::NumIndices == T2::NumIndices);
      for( auto i = 0 ; bReturnValue && i < T1::NumIndices ; i++ )
          bReturnValue = ( lhs.dimension(i) == rhs.dimension(i) );
      if( bReturnValue ) {
        // Reduce the element-wise comparison to a single bool (rank-0 tensor)
        Eigen::Tensor<bool, 0, T1::Options> bResult = (lhs == rhs).all();
        bReturnValue = bResult(0);
      }
      return bReturnValue;
    }
    // CompareMember, vectors of Eigen tensors: equal when the sizes agree and
    // every pair of corresponding tensors compares equal. Stops at the first
    // mismatch.
    template <typename T>
    static inline typename std::enable_if<EigenIO::is_tensor<T>::value, bool>::type
    CompareMember(const std::vector<T> &lhs, const std::vector<T> &rhs) {
      using SizeType = typename std::vector<T>::size_type;
      const SizeType NumElements = lhs.size();
      bool bResult = ( NumElements == rhs.size() );
      // The index has the vector's size type, so it is never compared against
      // the (unsigned) element count as a signed int.
      for( SizeType i = 0 ; i < NumElements && bResult ; i++ )
        bResult = CompareMember(lhs[i], rhs[i]);
      return bResult;
    }
    // WriteMember, general case: plain operator<<.
    template <typename T>
    static inline typename std::enable_if<!EigenIO::is_tensor<T>::value, void>::type
    WriteMember(std::ostream &os, const T &object) {
      os << object;
    }
    // WriteMember, Eigen tensor: prints "T<d0,d1,...>{e0,e1,...}", i.e. the
    // dimensions followed by the elements in the order they are laid out at
    // data(). Tensors with no elements are rejected by assertion.
    template <typename T>
    static inline typename std::enable_if<EigenIO::is_tensor<T>::value, void>::type
    WriteMember(std::ostream &os, const T &object) {
      using Index = typename T::Index;
      const Index NumElements{object.size()};
      assert( NumElements > 0 );
      Index count = 1;
      os << "T<";
      for( int i = 0; i < T::NumIndices; i++ ) {
        Index dim = object.dimension(i);
        count *= dim;
        if( i )
          os << ",";
        os << dim;
      }
      assert( count == NumElements && "Number of elements doesn't match tensor dimensions" );
      os << ">{";
      const typename T::Scalar * p = object.data();
      for( Index i = 0; i < count; i++ ) {
        if( i )
          os << ",";
        os << *p++;
      }
      os << "}";
    }
  };
  // Generic writer interface //////////////////////////////////////////////////
  // Free-function form of Writer<T>::push.
  template <typename T>
  inline void push(Writer<T> &w, const std::string &s) {
    w.push(s);
  }
  // Overload for C strings: converts s to std::string before pushing.
  template <typename T>
  inline void push(Writer<T> &w, const char *s)
  {
    w.push(std::string(s));
  }
  // Free-function form of Writer<T>::pop.
  template <typename T>
  inline void pop(Writer<T> &w)
  {
    w.pop();
  }
  // Free-function form of Writer<T>::write; the member overload is chosen
  // from the type of output.
  template <typename T, typename U>
  inline void write(Writer<T> &w, const std::string& s, const U &output)
  {
    w.write(s, output);
  }
  // Generic reader interface //////////////////////////////////////////////////
  // Free-function form of Reader<T>::push; returns the reader's result.
  template <typename T>
  inline bool push(Reader<T> &r, const std::string &s)
  {
    return r.push(s);
  }
  // Overload for C strings: converts s to std::string before pushing.
  template <typename T>
  inline bool push(Reader<T> &r, const char *s)
  {
    return r.push(std::string(s));
  }
  // Free-function form of Reader<T>::pop.
  template <typename T>
  inline void pop(Reader<T> &r)
  {
    r.pop();
  }
  // Free-function form of Reader<T>::read; the member overload is chosen
  // from the type of output.
  template <typename T, typename U>
  inline void read(Reader<T> &r, const std::string &s, U &output)
  {
    r.read(s, output);
  }
 }
 #endif
--- a/Grid/serialisation/VectorUtils.h
+++ b/Grid/serialisation/VectorUtils.h
@@ -1,479 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./Grid/serialisation/VectorUtils.h
 Copyright (C) 2015
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_SERIALISATION_VECTORUTILS_H
 #define GRID_SERIALISATION_VECTORUTILS_H
 #include <type_traits>
 #include <Grid/tensors/Tensors.h>
 namespace Grid {
  // Pair IO utilities /////////////////////////////////////////////////////////
  // helper function to parse input in the format "(obj1 obj2)"
  //
  // Everything up to the first '(' is skipped, the text up to the next ')' is
  // extracted and parsed as "T1 T2" through a temporary string stream, and the
  // input stream is left positioned just after the ')'. If no complete
  // "( ... )" group is found, buf is left untouched and the stream carries the
  // error state produced by the failed read. The stream must be seekable.
  template <typename T1, typename T2>
  inline std::istream & operator>>(std::istream &is, std::pair<T1, T2> &buf)
  {
    T1 buf1{};
    T2 buf2{};
    char c = '\0';
    // Search for "pair" delimiters. Looping on the result of get() (instead of
    // on eof()) terminates on any stream failure, not only on end-of-file, and
    // never inspects a character that was not actually read.
    while (is.get(c) && c != '(')
    {
    }
    if (c == '(')
    {
      const std::istream::pos_type start = is.tellg();
      while (is.get(c) && c != ')')
      {
      }
      if (c == ')')
      {
        const std::istream::pos_type end = is.tellg();
        const std::streamoff psize = end - start - 1;
        if (psize >= 0)
        {
          // Only read data between pair limiters.
          is.seekg(start);
          std::string tmpstr(static_cast<std::string::size_type>(psize), ' ');
          is.read(&tmpstr[0], psize);
          std::istringstream temp(tmpstr);
          temp >> buf1 >> buf2;
          buf = std::make_pair(buf1, buf2);
          is.seekg(end);
        }
        else
        {
          // tellg() could not report positions (non-seekable stream).
          is.setstate(std::ios_base::failbit);
        }
      }
    }
    // Touch the next character so that eof is flagged when the pair was the
    // last thing in the stream.
    is.peek();
    return is;
  }
  // output to streams for pairs
  // Streams a pair as "(first second)" and hands the stream back for chaining.
  template <class T1, class T2>
  inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
  {
    return os << "(" << p.first << " " << p.second << ")";
  }
  // std::vector<std::vector<...>> nested to specified Rank //////////////////////////////////
  // NestedStdVector<T, Rank>::type is T wrapped in Rank levels of std::vector.
  template<typename T, unsigned int Rank>
  struct NestedStdVector {
    using type = std::vector<typename NestedStdVector<T, Rank - 1>::type>;
  };
  // Recursion terminator: zero levels of nesting is just T.
  template<typename T>
  struct NestedStdVector<T,0> {
    using type = T;
  };
  // Grid scalar tensors to nested std::vectors //////////////////////////////////
  // TensorToVec<T>::type is the nested std::vector type matching tensor type T.
  // Anything that is not a Grid tensor wrapper maps to itself.
  template <typename T>
  struct TensorToVec
  {
    using type = T;
  };
  // iScalar adds no vector level: it maps to whatever its content maps to.
  template <typename T>
  struct TensorToVec<iScalar<T>>
  {
    using type = typename TensorToVec<T>::type;
  };
  // iVector adds one vector level.
  template <typename T, int N>
  struct TensorToVec<iVector<T, N>>
  {
    using type = std::vector<typename TensorToVec<T>::type>;
  };
  // iMatrix adds two vector levels.
  template <typename T, int N>
  struct TensorToVec<iMatrix<T, N>>
  {
    using type = std::vector<std::vector<typename TensorToVec<T>::type>>;
  };
  // tensorDim: collects into dim the extent contributed by each tensor level of
  // t, outermost first. With wipe == true (the default) dim is emptied first;
  // the recursive calls pass false so that extents accumulate.
  //
  // Non-tensor leaf: contributes no extent.
  template <typename T>
  void tensorDim(std::vector<size_t> &dim, const T &t, const bool wipe = true)
  {
    if (wipe) dim.clear();
  }
  // iScalar: contributes no extent of its own, recurse into the content.
  template <typename T>
  void tensorDim(std::vector<size_t> &dim, const iScalar<T> &t, const bool wipe = true)
  {
    if (wipe) dim.clear();
    tensorDim(dim, t._internal, false);
  }
  // iVector: contributes N, then recurse into the first element.
  template <typename T, int N>
  void tensorDim(std::vector<size_t> &dim, const iVector<T, N> &t, const bool wipe = true)
  {
    if (wipe) dim.clear();
    dim.push_back(N);
    tensorDim(dim, t._internal[0], false);
  }
  // iMatrix: contributes N twice, then recurse into the first element.
  template <typename T, int N>
  void tensorDim(std::vector<size_t> &dim, const iMatrix<T, N> &t, const bool wipe = true)
  {
    if (wipe) dim.clear();
    dim.push_back(N);
    dim.push_back(N);
    tensorDim(dim, t._internal[0][0], false);
  }
  // tensorToVec: converts a tensor into the nested std::vector type given by
  // TensorToVec.
  //
  // Non-tensor leaf: returned as is.
  template <typename T>
  typename TensorToVec<T>::type tensorToVec(const T &t)
  {
    return t;
  }
  // iScalar: no vector level, convert the content.
  template <typename T>
  typename TensorToVec<iScalar<T>>::type tensorToVec(const iScalar<T>& t)
  {
    return tensorToVec(t._internal);
  }
  // iVector: one vector of N converted elements.
  template <typename T, int N>
  typename TensorToVec<iVector<T, N>>::type tensorToVec(const iVector<T, N>& t)
  {
    typename TensorToVec<iVector<T, N>>::type out(N);
    for (int k = 0; k < N; ++k)
    {
      out[k] = tensorToVec(t._internal[k]);
    }
    return out;
  }
  // iMatrix: N rows of N converted elements each.
  template <typename T, int N>
  typename TensorToVec<iMatrix<T, N>>::type tensorToVec(const iMatrix<T, N>& t)
  {
    typename TensorToVec<iMatrix<T, N>>::type rows(N);
    for (int r = 0; r < N; ++r)
    {
      auto &row = rows[r];
      row.resize(N);
      for (int c = 0; c < N; ++c)
      {
        row[c] = tensorToVec(t._internal[r][c]);
      }
    }
    return rows;
  }
  // vecToTensor: inverse of tensorToVec, copies a nested std::vector into a tensor.
  //
  // Non-tensor leaf: plain assignment.
  template <typename T>
  void vecToTensor(T &t, const typename TensorToVec<T>::type &v)
  {
    t = v;
  }
  // iScalar: no vector level, fill the content from v directly.
  template <typename T>
  void vecToTensor(iScalar<T> &t, const typename TensorToVec<iScalar<T>>::type &v)
  {
    vecToTensor(t._internal, v);
  }
  // iVector: element i is filled from v[i]. v is indexed for all i < N without
  // a size check, so it must hold at least N entries.
  template <typename T, int N>
  void vecToTensor(iVector<T, N> &t, const typename TensorToVec<iVector<T, N>>::type &v)
  {
    for (unsigned int i = 0; i < N; i++) 
    {
      vecToTensor(t._internal[i], v[i]);
    }
  }
  // iMatrix: element (i,j) is filled from v[i][j]. v is indexed for all
  // i,j < N without a size check, so it must be at least N x N.
  template <typename T, int N>
  void vecToTensor(iMatrix<T, N> &t, const typename TensorToVec<iMatrix<T, N>>::type &v)
  {
    for (unsigned int i = 0; i < N; i++)
    for (unsigned int j = 0; j < N; j++)
    {
      vecToTensor(t._internal[i][j], v[i][j]);
    }
  }
  // Vector element trait //////////////////////////////////////////////////////  
  // element<T>: for a non-vector T, type is T itself and is_number is false.
  template <typename T>
  struct element
  {
    typedef T type;
    static constexpr bool is_number = false;
  };
  // element<std::vector<T>>: type is found by stripping std::vector levels
  // recursively. is_number is true when T is arithmetic, satisfies is_complex,
  // or is itself a vector for which is_number is true.
  template <typename T>
  struct element<std::vector<T>>
  {
    typedef typename element<T>::type type;
    static constexpr bool is_number = std::is_arithmetic<T>::value
                                      or is_complex<T>::value
                                      or element<T>::is_number;
  };
  // Vector flattening utility class ////////////////////////////////////////////
  // Class to flatten a multidimensional std::vector
  // Flatten<V> copies every innermost element of a nested std::vector V into a
  // single flat std::vector<Element> and records the size found at each nesting
  // level. All the work is done in the constructor.
  template <typename V>
  class Flatten
  {
  public:
    // Innermost (non-vector) element type of V.
    typedef typename element<V>::type Element;
  public:
    // Flattens vector immediately. Only a reference to vector is stored, so it
    // must outlive this object if getVector() is used.
    explicit                     Flatten(const V &vector);
    // The original nested vector passed to the constructor.
    const V &                    getVector(void);
    // All innermost elements, in iteration order.
    const std::vector<Element> & getFlatVector(void);
    // Size at each nesting level, outermost first; deeper levels are measured
    // on the first element of each level only.
    const std::vector<size_t>  & getDim(void);
  private:
    // Leaf case: appends one element to flatVector_.
    void accumulate(const Element &e);
    // Container case: recurses over every entry of v.
    template <typename W>
    void accumulate(const W &v);
    // Leaf case: nothing to record.
    void accumulateDim(const Element &e);
    // Container case: records v.size() and recurses into v[0].
    template <typename W>
    void accumulateDim(const W &v);
  private:
    const V              &vector_;
    std::vector<Element> flatVector_;
    std::vector<size_t>  dim_;
  };
  // Class to reconstruct a multidimensional std::vector
  // Reconstruct<V> rebuilds a nested std::vector V from a flat element vector
  // and a list of per-level sizes. All the work is done in the constructor.
  template <typename V>
  class Reconstruct
  {
  public:
    // Innermost (non-vector) element type of V.
    typedef typename element<V>::type Element;
  public:
    // Builds the nested vector immediately. flatVector is stored by reference,
    // so it must outlive this object if getFlatVector() is used; dim is copied.
    Reconstruct(const std::vector<Element> &flatVector,
                const std::vector<size_t> &dim);
    // The rebuilt nested vector.
    const V &                    getVector(void);
    // The flat vector passed to the constructor.
    const std::vector<Element> & getFlatVector(void);
    // The per-level sizes passed to the constructor.
    const std::vector<size_t>  & getDim(void);
  private:
    // Innermost level: copies consecutive flat elements into v.
    void fill(std::vector<Element> &v);
    // Outer levels: recurses over every entry of v.
    template <typename W>
    void fill(W &v);
    // Innermost level: resizes v to dim_[dim].
    void resize(std::vector<Element> &v, const unsigned int dim);
    // Outer levels: resizes v to dim_[dim] and recurses with dim + 1.
    template <typename W>
    void resize(W &v, const unsigned int dim);
  private:
    V                          vector_;
    const std::vector<Element> &flatVector_;
    std::vector<size_t>        dim_;
    // Next position to read in flatVector_ while filling.
    size_t                     ind_{0};
    // NOTE(review): not referenced by any member function visible here.
    unsigned int               dimInd_{0};
  };
  // Flatten class template implementation
  // Leaf case: append one innermost element to the flat vector.
  template <typename V>
  void Flatten<V>::accumulate(const Element &e)
  {
    flatVector_.push_back(e);
  }
  // Container case: visit every entry, recursing until the leaf overload matches.
  template <typename V>
  template <typename W>
  void Flatten<V>::accumulate(const W &v)
  {
    for (const auto &e: v)
    {
      accumulate(e);
    }
  }
  // Leaf case: an element contributes no dimension.
  template <typename V>
  void Flatten<V>::accumulateDim(const Element &e) {}
  // Container case: record this level's size, then measure the next level on
  // the first entry. An empty level has no first entry to inspect, so the
  // recursion stops there (dim_ then ends with 0) instead of indexing v[0] out
  // of bounds.
  template <typename V>
  template <typename W>
  void Flatten<V>::accumulateDim(const W &v)
  {
    dim_.push_back(v.size());
    if (v.size() > 0)
    {
      accumulateDim(v[0]);
    }
  }
  // Stores a reference to vector and flattens it straight away.
  template <typename V>
  Flatten<V>::Flatten(const V &vector)
  : vector_(vector)
  {
    accumulate(vector_);
    accumulateDim(vector_);
  }
  // Returns the nested vector given at construction.
  template <typename V>
  const V & Flatten<V>::getVector(void)
  {
    return vector_;
  }
  // Returns all innermost elements in iteration order.
  template <typename V>
  const std::vector<typename Flatten<V>::Element> &
  Flatten<V>::getFlatVector(void)
  {
    return flatVector_;
  }
  // Returns the size recorded at each nesting level, outermost first.
  template <typename V>
  const std::vector<size_t> & Flatten<V>::getDim(void)
  {
    return dim_;
  }
  // Reconstruct class template implementation
  // Innermost level: take the next v.size() flat elements, in order.
  template <typename V>
  void Reconstruct<V>::fill(std::vector<Element> &v)
  {
    for (size_t k = 0; k < v.size(); ++k)
    {
      v[k] = flatVector_[ind_++];
    }
  }
  // Outer levels: fill every sub-vector in order.
  template <typename V>
  template <typename W>
  void Reconstruct<V>::fill(W &v)
  {
    for (auto &sub: v)
    {
      fill(sub);
    }
  }
  // Innermost level: size v according to dim_[dim].
  template <typename V>
  void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
  {
    v.resize(dim_[dim]);
  }
  // Outer levels: size v according to dim_[dim], then size each sub-vector
  // with the next entry of dim_.
  template <typename V>
  template <typename W>
  void Reconstruct<V>::resize(W &v, const unsigned int dim)
  {
    const unsigned int next = dim + 1;
    v.resize(dim_[dim]);
    for (auto &sub: v)
    {
      resize(sub, next);
    }
  }
  // Shapes vector_ from dim, then copies the flat elements into it.
  template <typename V>
  Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
                              const std::vector<size_t> &dim)
  : flatVector_(flatVector)
  , dim_(dim)
  {
    resize(vector_, 0);
    fill(vector_);
  }
  // Returns the rebuilt nested vector.
  template <typename V>
  const V & Reconstruct<V>::getVector(void)
  {
    return vector_;
  }
  // Returns the flat vector given at construction.
  template <typename V>
  const std::vector<typename Reconstruct<V>::Element> &
  Reconstruct<V>::getFlatVector(void)
  {
    return flatVector_;
  }
  // Returns the per-level sizes given at construction.
  template <typename V>
  const std::vector<size_t> & Reconstruct<V>::getDim(void)
  {
    return dim_;
  }
  // Vector IO utilities ///////////////////////////////////////////////////////
  // helper function to read space-separated values
  // Parses s as a whitespace-separated list of T and returns the values in
  // order. Parsing stops at the first token that cannot be read as a T (or at
  // the end of the string); only successfully extracted values are returned,
  // so an empty or all-blank string gives an empty vector.
  template <typename T>
  std::vector<T> strToVec(const std::string s)
  {
    std::istringstream sstr(s);
    T                  buf;
    std::vector<T>     v;
    // Test the extraction itself: looping on !eof() would push a bogus element
    // after trailing whitespace and never terminate on a malformed token.
    while (sstr >> buf)
    {
      v.push_back(buf);
    }
    return v;
  }
  // output to streams for vectors
  // Streams a vector as "[e0 e1 ... eN]": elements separated by single spaces
  // and wrapped in square brackets.
  template < class T >
  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
  {
    os << "[";
    for (auto it = v.begin(); it != v.end(); ++it)
    {
      // Separator goes between elements only, never before the first one.
      if (it != v.begin())
      {
        os << " ";
      }
      os << *it;
    }
    os << "]";
    return os;
  }
 }
 // helper function to write a vector to a string: returns v formatted by
 // Grid's operator<< for std::vector
 template <typename T>
 std::string vecToStr(const std::vector<T> &v)
 {
  // This function is declared outside namespace Grid, so the vector stream
  // operator has to be brought into scope explicitly.
  using Grid::operator<<;
  std::ostringstream sstr;
  sstr << v;
  return sstr.str();
 }
 #endif
--- a/Grid/serialisation/XmlIO.cc
+++ b/Grid/serialisation/XmlIO.cc
@@ -1,188 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/serialisation/XmlIO.cc
    Copyright (C) 2015
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 using namespace Grid;
 // Aborts the program with a diagnostic on stderr when result reports a failed
 // parse; name identifies the input in the message. Does nothing on success.
 void Grid::xmlCheckParse(const pugi::xml_parse_result &result, const std::string name)
 {
  if (result)
  {
    return;
  }
  std::cerr << "XML parsing error for " << name << std::endl;
  std::cerr << "XML error description: " << result.description() << std::endl;
  std::cerr << "XML error offset     : " << result.offset << std::endl;
  abort();
 }
 // Writer implementation ///////////////////////////////////////////////////////
 // Remembers the output file name and selects the node writes start from: the
 // document itself when toplev is empty, otherwise a new child named toplev.
 XmlWriter::XmlWriter(const std::string &fileName, std::string toplev) : fileName_(fileName)
 {
  if ( toplev.empty() ) {
    node_ = doc_;
    return;
  }
  node_ = doc_.append_child();
  node_.set_name(toplev.c_str());
 }
 // Saves the document to fileName_ on destruction; an empty file name means
 // nothing is written to disk.
 XmlWriter::~XmlWriter(void)
 {
  if ( fileName_.empty() ) {
    return;
  }
  doc_.save_file(fileName_.c_str(), indent_.c_str());
 }
 // Appends a child named s to the current node and makes that child current.
 void XmlWriter::push(const std::string &s)
 {
  node_ = node_.append_child(s.c_str());
 }
 // Parses the XML fragment s (aborting through xmlCheckParse if it is
 // malformed) and appends a copy of each of its top-level nodes as a child of
 // the current node. The current node is the same before and after the call.
 void XmlWriter::pushXmlString(const std::string &s)
 {
  pugi::xml_document doc;
  auto               result = doc.load_buffer(s.c_str(), s.size());
  xmlCheckParse(result, "fragment\n'" + s +"'");
  // Copy into node_ without descending into the copy: moving node_ into each
  // copy and popping once at the end nests every later top-level node inside
  // the previous one, and pops one level too many for an empty fragment.
  for (pugi::xml_node child = doc.first_child(); child; child = child.next_sibling())
  {
      node_.append_copy(child);
  }
 }
 // Makes the parent of the current node current.
 void XmlWriter::pop(void)
 {
  node_ = node_.parent();
 }
 // Returns the whole document serialised with the writer's indent string and
 // pugixml's default save flags.
 std::string XmlWriter::docString(void)
 {
  std::ostringstream oss; 
  doc_.save(oss, indent_.c_str());
  return oss.str();
 }
 // Returns the whole document serialised like docString(), but with
 // format_no_declaration so the XML declaration is left out.
 std::string XmlWriter::string(void)
 {
  std::ostringstream oss; 
  doc_.save(oss, indent_.c_str(), pugi::format_default | pugi::format_no_declaration);
  return oss.str();
 }
 // Reader implementation ///////////////////////////////////////////////////////
 // Loads an XML document either from the string s itself (isBuffer == true) or
 // from the file named s, aborting through xmlCheckParse on a parse error.
 // Reading then starts at the document when toplev is empty, otherwise at the
 // document's child named toplev.
 XmlReader::XmlReader(const std::string &s,  const bool isBuffer, 
                     std::string toplev) 
 {
  pugi::xml_parse_result result;
  std::string            source;
  if (isBuffer)
  {
    fileName_ = "<string>";
    result    = doc_.load_string(s.c_str());
    source    = "string\n'" + s + "'";
  }
  else
  {
    fileName_ = s;
    result    = doc_.load_file(s.c_str());
    source    = "file '" + fileName_ + "'";
  }
  xmlCheckParse(result, source);
  if ( toplev.empty() ) {
    node_ = doc_;
    return;
  }
  node_ = doc_.child(toplev.c_str());
 }
 // Statement macro for bool-returning XmlReader members: evaluates expr exactly
 // once; when it yields a valid node, that node becomes node_ and the enclosing
 // function returns true, otherwise node_ is left alone and it returns false.
 // Holding the result in a local avoids running the node search in expr twice.
 #define XML_SAFE_NODE(expr)\
 {\
  pugi::xml_node xmlSafeNodeTmp_ = (expr);\
  if (xmlSafeNodeTmp_)\
  {\
    node_ = xmlSafeNodeTmp_;\
    return true;\
  }\
  return false;\
 }
 // Descends into a child of the current node: the first child when s is empty,
 // otherwise the child named s. Returns true and moves there when such a child
 // exists; returns false and stays put otherwise (see XML_SAFE_NODE).
 bool XmlReader::push(const std::string &s)
 {
  if (s.empty())
  {
    XML_SAFE_NODE(node_.first_child());
  }
  else
  {
    XML_SAFE_NODE(node_.child(s.c_str()));
  }
 }
 // Makes the parent of the current node current.
 void XmlReader::pop(void)
 {
  node_ = node_.parent();
 }
 // Moves to a following sibling of the current node: the next sibling when s
 // is empty, otherwise the next sibling named s. Returns true and moves there
 // when such a sibling exists; returns false and stays put otherwise.
 bool XmlReader::nextElement(const std::string &s)
 {
  if (s.empty())
  {
    XML_SAFE_NODE(node_.next_sibling());
  }
  else
  {
    XML_SAFE_NODE(node_.next_sibling(s.c_str()));
  }
 }
 // Serialises the current node and everything below it into s. The node is
 // copied into a scratch document, which is saved without an XML declaration.
 void XmlReader::readCurrentSubtree(std::string &s)
 {
  std::ostringstream oss; 
  pugi::xml_document doc;
  doc.append_copy(node_);
  doc.save(oss, indent_.c_str(), pugi::format_default | pugi::format_no_declaration);
  s = oss.str();
 }
 // String specialisation: output receives the value of the first child of the
 // current node's child named s. When no such child exists a warning is logged
 // and output is set to the empty string.
 template <>
 void XmlReader::readDefault(const std::string &s, std::string &output)
 {
  const pugi::xml_node target = node_.child(s.c_str());
  if (!target)
  {
    std::cout << GridLogWarning << "XML: cannot open node '" << s << "'";
    std::cout << std::endl;
    output = ""; 
    return;
  }
  output = target.first_child().value();
 }
--- a/Grid/serialisation/XmlIO.h
+++ b/Grid/serialisation/XmlIO.h
@@ -1,246 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/serialisation/XmlIO.h
    Copyright (C) 2015
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_SERIALISATION_XML_READER_H
 #define GRID_SERIALISATION_XML_READER_H
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <sstream>
 #include <math.h>
 #include <vector>
 #include <cassert>
 #include <Grid/pugixml/pugixml.h>
 #include <Grid/GridCore.h>
 namespace Grid
 {
  // Checks a pugixml parse result; name labels the input in the error output.
  // Defined in XmlIO.cc, where a failed parse is reported on stderr and aborts.
  void xmlCheckParse(const pugi::xml_parse_result &result, const std::string name);
  // XML back-end of the generic Writer interface, built on a pugixml document.
  // The writer keeps a "current node" that push()/pop() move down and up.
  class XmlWriter: public Writer<XmlWriter>
  {    
  public:
    // fileName: file the document is saved to on destruction (empty: no file).
    // toplev:   name of the root element created at construction (empty: none).
    XmlWriter(const std::string &fileName, std::string toplev = std::string("grid") );
    virtual ~XmlWriter(void);
    // Appends a child named s to the current node and descends into it.
    void push(const std::string &s);
    // Parses s as an XML fragment and copies its nodes into the document.
    void pushXmlString(const std::string &s);
    // Moves back up to the parent of the current node.
    void pop(void);
    // Writes x as the text content of a new child named s.
    template <typename U>
    void writeDefault(const std::string &s, const U &x);
    // Writes a vector as a node named s with one "elem" entry per element.
    template <typename U>
    void writeDefault(const std::string &s, const std::vector<U> &x);
    // Writes a row-major multi-dimensional array together with its shape.
    template <typename U>
    void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
    // Whole document as a string, saved with default flags.
    std::string docString(void);
    // Whole document as a string, without the XML declaration.
    std::string string(void);
  private:
    const std::string  indent_{"  "};
    pugi::xml_document doc_;
    pugi::xml_node     node_;
    std::string        fileName_;
  };
  // XML back-end of the generic Reader interface, built on a pugixml document.
  // The reader keeps a "current node" that push()/pop()/nextElement() move.
  class XmlReader: public Reader<XmlReader>
  {
  public:
    // fileName: file to load, or the XML text itself when isBuffer is true.
    // toplev:   name of the element reading starts from (empty: the document).
    XmlReader(const std::string &fileName, const bool isBuffer = false, 
              std::string toplev = std::string("grid") );
    virtual ~XmlReader(void) = default;
    // Descends into the child named s (first child when s is empty); returns
    // false and stays put when there is no such child.
    bool push(const std::string &s = "");
    // Moves back up to the parent of the current node.
    void pop(void);
    // Moves to the next sibling named s (any sibling when s is empty); returns
    // false and stays put when there is none.
    bool nextElement(const std::string &s = "");
    // Reads the child named s and converts its text to U.
    template <typename U>
    void readDefault(const std::string &s, U &output);
    // Reads the "elem" entries of the child named s into output.
    template <typename U>
    void readDefault(const std::string &s, std::vector<U> &output);
    // Reads an array written by XmlWriter::writeMultiDim: values go to buf,
    // the shape to dim.
    template <typename U>
    void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
    // Serialises the current node and its descendants into s.
    void readCurrentSubtree(std::string &s);
  private:
    // NOTE(review): no definition of this member is visible in XmlIO.cc; the
    // free function xmlCheckParse is what the implementation calls.
    void checkParse(const pugi::xml_parse_result &result, const std::string name);
  private:
    const std::string  indent_{"  "};
    pugi::xml_document doc_;
    pugi::xml_node     node_;
    std::string        fileName_;
  };
  // Trait specialisation marking XmlReader as a reader type.
  template <>
  struct isReader< XmlReader > {
    static const bool value = true;
  };
  // Trait specialisation marking XmlWriter as a writer type.
  template <>
  struct isWriter<XmlWriter > {
    static const bool value = true;
  };
  // Writer template implementation ////////////////////////////////////////////
  // Formats x through an ostringstream and stores the text as the content of a
  // new child named s under the current node. The current node is not changed.
  template <typename U>
  void XmlWriter::writeDefault(const std::string &s, const U &x)
  {
    std::ostringstream os;
    // A zero precision leaves the stream's default precision in place.
    if (getPrecision())
    {
      os.precision(getPrecision());
    }
    if (isScientific())
    {
      os << std::scientific;
    }
    // boolalpha makes bool values come out as "true"/"false".
    os << std::boolalpha << x;
    pugi::xml_node leaf = node_.append_child(s.c_str());
    leaf.append_child(pugi::node_pcdata).set_value(os.str().c_str());
  }
  // Writes a vector as a node named s containing one "elem" entry per element,
  // in order. An empty vector produces an empty node.
  template <typename U>
  void XmlWriter::writeDefault(const std::string &s, const std::vector<U> &x)
  {
    push(s);
    for( auto &u : x )
    {
      write("elem", u);
    }
    pop();
  }
  // Writes a multi-dimensional array under a node named s.
  //   Dimensions    : extent of each index, slowest-varying first
  //   pDataRowMajor : NumElements values in row-major order
  //   NumElements   : must equal the product of Dimensions (asserted)
  // Layout: a "rank" entry, one "dim" entry per dimension, then "tensor" nodes
  // nested Rank deep with the values stored as "elem" entries at the innermost
  // level.
  template <typename U>
  void XmlWriter::writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements)
  {
    push(s);
    size_t count = 1;
    const int Rank = static_cast<int>( Dimensions.size() );
    write("rank", Rank );
    std::vector<size_t> MyIndex( Rank );
    for( auto d : Dimensions ) {
      write("dim", d);
      count *= d;
    }
    assert( count == NumElements && "XmlIO : element count doesn't match dimensions" );
    static const char sName[] = "tensor";
    // Open one "tensor" node per index so that writing starts at the innermost level.
    for( int i = 0 ; i < Rank ; i++ ) {
      MyIndex[i] = 0;
      push(sName);
    }
    while (NumElements--) {
      write("elem", *pDataRowMajor++);
      // Advance the multi-index like an odometer, fastest index last; the loop
      // stops at the first index that did not wrap around.
      int i;
      for( i = Rank - 1 ; i != -1 && ++MyIndex[i] == Dimensions[i] ; i-- )
        MyIndex[i] = 0;
      // Number of trailing indices that wrapped: close that many "tensor"
      // nodes and, unless this was the last element, open the same number again.
      int Rollover = Rank - 1 - i;
      for( i = 0 ; i < Rollover ; i++ )
        pop();
      for( i = 0 ; NumElements && i < Rollover ; i++ )
        push(sName);
    }
    // Leave the node named s.
    pop();
  }
  // Reader template implementation ////////////////////////////////////////////
  // Generic scalar read: fetches the text of the child named s through the
  // std::string specialisation, then converts it to U with fromString.
  template <typename U>
  void XmlReader::readDefault(const std::string &s, U &output)
  {
    std::string buf;
    readDefault(s, buf);
    fromString(output, buf);
  }
  // std::string specialisation, defined in XmlIO.cc; declared here so that the
  // generic template above is not used for strings.
  template <>
  void XmlReader::readDefault(const std::string &s, std::string &output);
  // Reads the "elem" entries of the child named s into output, in document
  // order. A warning is logged and output is left untouched when the node
  // named s is missing.
  // NOTE(review): output is only resized inside the loop, so a node with no
  // "elem" entries also leaves the previous content of output in place.
  template <typename U>
  void XmlReader::readDefault(const std::string &s, std::vector<U> &output)
  {
    if (!push(s))
    {
      std::cout << GridLogWarning << "XML: cannot open node '" << s << "'";
      std::cout << std::endl;
    } else {
      for(unsigned int i = 0; node_.child("elem"); )
      {
        output.resize(i + 1);
        read("elem", output[i++]);
        // Rename the entry just consumed so the next lookup of "elem" finds
        // the following one. This modifies the loaded document.
        node_.child("elem").set_name("elem-done");
      }
      pop();
    }
  }
  // Reads an array stored in the layout produced by XmlWriter::writeMultiDim
  // from the child named s: dim receives the extent of each index and buf the
  // values in row-major order. A warning is logged and both outputs are left
  // untouched when the node named s is missing.
  // Consumed "dim", "elem" and "tensor" nodes are renamed with a "-done"
  // suffix so that the next lookup by name finds the following one; this
  // modifies the loaded document.
  template <typename U>
  void XmlReader::readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim)
  {
    if (!push(s))
    {
      std::cout << GridLogWarning << "XML: cannot open node '" << s << "'";
      std::cout << std::endl;
    } else {
      static const char sName[] = "tensor";
      static const char sNameDone[] = "tensor-done";
      int Rank;
      read("rank", Rank);
      dim.resize( Rank );
      size_t NumElements = 1;
      for( auto &d : dim )
      {
        read("dim", d);
        node_.child("dim").set_name("dim-done");
        NumElements *= d;
      }
      buf.resize( NumElements );
      std::vector<size_t> MyIndex( Rank );
      // Descend through one "tensor" node per index to reach the first values.
      for( int i = 0 ; i < Rank ; i++ ) {
        MyIndex[i] = 0;
        push(sName);
      }
      for( auto &x : buf )
      {
        // NumElements now counts the values still to be read after this one.
        NumElements--;
        read("elem", x);
        node_.child("elem").set_name("elem-done");
        // Advance the multi-index like an odometer, fastest index last; the
        // loop stops at the first index that did not wrap around.
        int i;
        for( i = Rank - 1 ; i != -1 && ++MyIndex[i] == dim[i] ; i-- )
          MyIndex[i] = 0;
        // Number of trailing indices that wrapped: leave that many "tensor"
        // nodes (marking each as consumed) and, unless this was the last
        // value, enter the same number of fresh ones.
        int Rollover = Rank - 1 - i;
        for( i = 0 ; i < Rollover ; i++ ) {
          node_.set_name(sNameDone);
          pop();
        }
        for( i = 0 ; NumElements && i < Rollover ; i++ )
          push(sName);
      }
      // Leave the node named s.
      pop();
    }
  }
 }
 #endif
--- a/Grid/util/Profiling.h
+++ b/Grid/util/Profiling.h
@@ -1,72 +0,0 @@
   /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/util/Profiling.h
    Copyright (C) 2018
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_PERF_PROFILING_H
 #define GRID_PERF_PROFILING_H
 #include <sstream>
 #include <iostream>
 #include <functional>
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <signal.h>
 // Runs a callable while a `perf record` child process is attached to the
 // calling process.
 struct System
 {
    // Runs body() with /usr/bin/perf recording this process into a file.
    //   name : output file; ".data" is appended unless name already contains it
    //   body : work to profile, executed in the calling process
    // The profiler is stopped with SIGINT and reaped once body() returns. If
    // the profiler cannot be forked, body() still runs, unprofiled.
    static void profile(const std::string& name,std::function<void()> body) {
        const std::string filename = name.find(".data") == std::string::npos ? (name + ".data") : name;
        // Render our pid before forking so the child has nothing to format.
        std::stringstream s;
        s << getpid();
        const std::string self = s.str();
        // Launch profiler
        const pid_t pid = fork();
        if (pid == 0) {
            // Child: silence stdout/stderr, then become perf.
            const int fd = open("/dev/null",O_RDWR);
            if (fd >= 0) {
                dup2(fd,1);
                dup2(fd,2);
                if (fd > 2) {
                    close(fd);
                }
            }
            execl("/usr/bin/perf","perf","record","-o",filename.c_str(),"-p",self.c_str(),static_cast<char *>(nullptr));
            // execl only returns on failure. _exit (not exit) keeps the child
            // from running the parent's atexit handlers and from flushing
            // stdio buffers inherited through fork.
            _exit(127);
        }
        if (pid < 0) {
            // fork failed: kill(-1, SIGINT) would signal every process we are
            // allowed to signal, so never reach the kill below with pid == -1.
            std::cerr << "System::profile: fork failed, running without profiler" << std::endl;
            body();
            return;
        }
        // Run body
        body();
        // Kill profiler  
        kill(pid,SIGINT);
        waitpid(pid,nullptr,0);
    }
    // Same as above, recording into "perf.data".
    static void profile(std::function<void()> body) {
        profile("perf.data",body);
    }
 };
 #endif // GRID_PERF_PROFILING_H
--- a/Grid/util/Sha.h
+++ b/Grid/util/Sha.h
@@ -1,99 +0,0 @@
   /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/util/Sha.h
    Copyright (C) 2018
    Author: Peter Boyle
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 extern "C" {
 #include <openssl/sha.h>
 }
 #ifdef USE_IPP
 #include "ipp.h"
 #endif
 #pragma once
 // Collection of static checksum / hash helpers.
 class GridChecksum
 {
 public:
  // CRC-32 of a byte range, computed by the global ::crc32 starting from 0.
  static inline uint32_t crc32(const void *data, size_t bytes)
  {
    return ::crc32(0L,static_cast<const unsigned char *>(data),bytes);
  }
 #ifdef USE_IPP
  // CRC-32C of a byte range using the IPP primitives (built only with USE_IPP).
  static inline uint32_t crc32c(const void* data, size_t bytes)
  {
      uint32_t crc32c = ~(uint32_t)0;
      ippsCRC32C_8u(reinterpret_cast<const unsigned char *>(data), bytes, &crc32c);
      ippsSwapBytes_32u_I(&crc32c, 1);
      return ~crc32c;
  }
 #endif
  // Renders a digest as lowercase hexadecimal, two digits per entry.
  // Each entry is zero-padded to width 2: without the padding a byte such as
  // 0x0a is printed as "a", so different digests could give the same string.
  // NOTE(review): strings produced before this fix may be shorter; compare
  // against freshly generated strings only.
  template <typename T>
  static inline std::string sha256_string(const std::vector<T> &hash)
  {
    std::stringstream sha;
    sha << std::hex;
    sha.fill('0');
    for (const auto &entry : hash)
    {
      sha.width(2); // the field width resets after every insertion
      sha << static_cast<unsigned int>(entry);
    }
    return sha.str();
  }
  // SHA-256 digest of a byte range, returned as SHA256_DIGEST_LENGTH bytes.
  static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
  {
    std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
    SHA256_CTX sha256;
    SHA256_Init  (&sha256);
    SHA256_Update(&sha256, data,bytes);
    SHA256_Final (hash.data(), &sha256);
    return hash;
  }
  // SHA-256 digest of the characters of s, widened to one int per digest byte.
  static inline std::vector<int> sha256_seeds(const std::string &s)
  {
    const std::vector<unsigned char> uchars = sha256(s.data(),s.size());
    return std::vector<int>(uchars.begin(), uchars.end());
  }
 };
 /*
 int main(int argc,char **argv)
 {
  std::string s("The quick brown fox jumps over the lazy dog");
  auto csum = GridChecksum::sha256_seeds(s);
  std::cout << "SHA256 sum is 0x";
  for(int i=0;i<csum.size();i++) { 
    std::cout << std::hex << csum[i];
  }
  std::cout << std::endl;
 }
 */
--- a/Grid/util/version.cc
+++ b/Grid/util/version.cc
@@ -1,12 +0,0 @@
 #include <iostream>
 #include "Version.h"
 namespace Grid {
  // Prints the git commit hash this library was built from to stdout.
  // GITHASH is expected to be a macro supplied at build time (Version.h is
  // included above); when it is not defined a fallback message is printed.
  void printHash(){
 #ifdef GITHASH
    std::cout << "Current Grid git commit hash=" << GITHASH << std::endl;
 #else
    std::cout << "Current Grid git commit hash is undefined. Check makefile." << std::endl;
 #endif
 // Drop the macro so it does not leak into the rest of this translation unit.
 #undef GITHASH
 }
 }
--- a/HMC/Makefile.am
+++ b/HMC/Makefile.am
@@ -1,6 +0,0 @@
 SUBDIRS = . 
 include Make.inc
--- a/HMC/Mobius2p1f.cc
+++ b/HMC/Mobius2p1f.cc
@@ -1,198 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1f.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 // HMC driver built on MobiusFermionR operators.
 //
 // Configures a MinimumNorm2 integrator, a NERSC checkpointer, RNG seeds and a
 // plaquette observable, then assembles the action:
 //   - Level1: two-flavour even-odd ratio pseudofermion terms forming a
 //     Hasenbusch chain from light_mass up to pv_mass;
 //   - Level2: the Iwasaki gauge action.
 // NOTE(review): the one-flavour strange pseudofermion is constructed below
 // but never pushed onto an action level (the push_back lines are commented
 // out), so as written it does not take part in the evolution.
 int main(int argc, char **argv) {
  using namespace Grid;
  using namespace Grid::QCD;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  // Integrator selection: exactly one HMCWrapper typedef / MD.name pair is active.
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 20;
  MD.trajL   = 1.0;
  // Trajectory bookkeeping: start from trajectory 0 with a cold start.
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  20;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  // Checkpoint file prefixes, save interval and format.
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
  CPparams.saveInterval  = 10;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  // Seeds for the serial and parallel random number generators.
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection 
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  // Physics parameters.
  // NOTE(review): light_mass is the first light denominator and pv_mass the
  // last light numerator (see the Hasenbusch chain below).
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.04;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; // Scale factor two
  RealD c   = 0.0;
  // Parameters handed to the one-flavour rational action for the strange quark.
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 1.0e-2;
  OFRp.hi       = 64;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-10;
  OFRp.degree   = 14;
  OFRp.precision= 40;
  // Intermediate Hasenbusch masses between light_mass and pv_mass.
  std::vector<Real> hasenbusch({ 0.1 });
  // Four-dimensional grids come from the HMC resources; the five-dimensional
  // (full and red-black) grids are built on top of the 4d grid with extent Ls.
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  // These lines are unnecessary if the boundary conditions are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  // A single CG (tolerance 1e-10, at most 30000 iterations) is shared by all
  // two-flavour quotient terms below.
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  // NOTE(review): the constructor argument is presumably the step multiplier
  // of the level relative to the outer one - confirm against ActionLevel.
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(4);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  //  FermionAction StrangeOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_mass,M5,b,c, Params);
  //  DomainWallEOFAFermionR Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5);
  //  DomainWallEOFAFermionR Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5);
  //  ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L,Strange_Op_R,CG,ofp, false);
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  //  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp);
  OneFlavourRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp);
  //  TwoFlavourRationalTesterPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion1F(StrangeOp,OFRp);
  //  TwoFlavourPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion2F(StrangeOp,CG,CG);
  //  Level1.push_back(&StrangePseudoFermion2F);
  //  Level1.push_back(&StrangePseudoFermion);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  // Build the mass chain: denominators are {light_mass, hasenbusch...} and
  // numerators are {hasenbusch..., pv_mass}, so term h is num[h] / den[h].
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  // The operators and actions are heap allocated and never deleted in this
  // function; Level1 stores the raw pointers to the quotient actions.
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1fEOFA.cc
+++ b/HMC/Mobius2p1fEOFA.cc
@@ -1,452 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1fEOFA.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu
 Author: David Murphy
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
 #endif
 namespace Grid{ 
  namespace QCD{
  /*
   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
   *    -- Store the single prec action operator.
   *    -- Clone the gauge field from the operator function argument.
   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
   */
  /*
   * OperatorFunction that solves with a mixed precision conjugate gradient.
   *
   * The object stores references to a double precision fermion operator and
   * its Schur operator (FermOpD / LinOpD) and to single precision
   * counterparts (FermOpF / LinOpF). On every call it copies the gauge links
   * held by FermOpD into FermOpF at single precision, refreshes the
   * even/odd checkerboarded copies, and then runs
   * MixedPrecisionConjugateGradient<FieldD,FieldF> with LinOpF and LinOpD.
   *
   * Template parameters:
   *   FermionOperatorD / FermionOperatorF : double / single precision fermion operators
   *   SchurOperatorD   / SchurOperatorF   : Schur operators wrapping them
   *
   * All referenced operators must outlive this object; only references are kept.
   */
  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
  public:
    typedef typename FermionOperatorD::FermionField FieldD;
    typedef typename FermionOperatorF::FermionField FieldF;
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid4; //Grid for single-precision fields
    GridBase* SinglePrecGrid5; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    FermionOperatorF &FermOpF;
    FermionOperatorD &FermOpD;
    SchurOperatorF &LinOpF;
    SchurOperatorD &LinOpD;
    // NOTE: operator() below does not update these counters; they are
    // zero-initialised so that reading them is well defined.
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    /*
     * tol        : solver tolerance (also used as the initial InnerTolerance)
     * maxinnerit : maximum inner iterations passed to the mixed precision CG
     * maxouterit : maximum outer iterations passed to the mixed precision CG
     * _sp_grid4  : single precision grid used for the gauge link copy
     * _sp_grid5  : single precision grid passed to the mixed precision CG
     * _FermOpF/_FermOpD, _LinOpF/_LinOpD : operators to reference (not copied)
     *
     * The initialisers are listed in member declaration order, which is the
     * order in which C++ actually initialises them.
     */
    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
 						    Integer maxinnerit, 
 						    Integer maxouterit, 
 						    GridBase* _sp_grid4, 
 						    GridBase* _sp_grid5, 
 						    FermionOperatorF &_FermOpF,
 						    FermionOperatorD &_FermOpD,
 						    SchurOperatorF   &_LinOpF,
 						    SchurOperatorD   &_LinOpD): 
      Tolerance(tol), 
      InnerTolerance(tol), 
      MaxInnerIterations(maxinnerit), 
      MaxOuterIterations(maxouterit), 
      SinglePrecGrid4(_sp_grid4),
      SinglePrecGrid5(_sp_grid5),
      OuterLoopNormMult(100.),
      FermOpF(_FermOpF),
      FermOpD(_FermOpD),
      LinOpF(_LinOpF),
      LinOpD(_LinOpD),
      TotalInnerIterations(0),
      TotalOuterIterations(0),
      TotalFinalStepIterations(0)
    { 
      /* Debugging instances of objects; references are stored
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
      */
    }
    /*
     * Solve for psi given src.
     *
     * LinOpU : must be a SchurOperatorD wrapping the same matrix object as the
     *          stored LinOpD (checked with an assert).
     * src    : right hand side
     * psi    : solution, passed through to the mixed precision CG
     */
    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
      // Assumption made in code to extract gauge field
      // We could avoid storing the LinOpD reference altogether?
      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
      ////////////////////////////////////////////////////////////////////////////////////
      // Must snarf a single precision copy of the gauge field in Linop_d argument
      ////////////////////////////////////////////////////////////////////////////////////
      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
      GridBase * GridPtrF = SinglePrecGrid4;
      GridBase * GridPtrD = FermOpD.Umu._grid;
      GaugeLinkFieldF Umu_f(GridPtrF);
      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
      ////////////////////////////////////////////////////////////////////////////////////
      // Moving this to a Clone method of fermion operator would allow to duplicate the 
      // physics parameters and decrease gauge field copies
      ////////////////////////////////////////////////////////////////////////////////////
      GaugeLinkFieldD Umu_d(GridPtrD);
      // Copy each of the Nd*2 link components of the double precision
      // operator's Umu into the single precision operator, one at a time.
      for(int mu=0;mu<Nd*2;mu++){ 
 	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
 	precisionChange(Umu_f,Umu_d);
 	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
      }
      // Keep the single precision checkerboarded copies in step with Umu.
      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
      ////////////////////////////////////////////////////////////////////////////////////
      // Could test to make sure that LinOpF and LinOpD agree to single prec?
      ////////////////////////////////////////////////////////////////////////////////////
      /*
      GridBase *Fgrid = psi._grid;
      FieldD tmp2(Fgrid);
      FieldD tmp1(Fgrid);
      LinOpU.Op(src,tmp1);
      LinOpD.Op(src,tmp2);
      std::cout << " Double gauge field "<< norm2(FermOpD.Umu)<<std::endl;
      std::cout << " Single gauge field "<< norm2(FermOpF.Umu)<<std::endl;
      std::cout << " Test of operators "<<norm2(tmp1)<<std::endl;
      std::cout << " Test of operators "<<norm2(tmp2)<<std::endl;
      tmp1=tmp1-tmp2;
      std::cout << " Test of operators diff "<<norm2(tmp1)<<std::endl;
      */
      ////////////////////////////////////////////////////////////////////////////////////
      // Make a mixed precision conjugate gradient
      ////////////////////////////////////////////////////////////////////////////////////
      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
      MPCG(src,psi);
    }
  };
 }};
 // HMC driver using Mobius fermions with an exact-one-flavour (EOFA) strange
 // quark term.
 //
 // Configures a Force Gradient integrator, restarts from a checkpoint at
 // trajectory 590, and assembles the action:
 //   - Level1: the EOFA ratio term followed by a Hasenbusch chain of
 //     two-flavour even-odd ratio terms;
 //   - Level2: the Iwasaki gauge action.
 // When MIXED_PRECISION is defined the force/action solves use the mixed
 // precision CG wrapper with single precision copies of the operators;
 // otherwise plain double precision ConjugateGradient objects are used.
 int main(int argc, char **argv) {
  using namespace Grid;
  using namespace Grid::QCD;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef MobiusFermionF FermionActionF;
  typedef MobiusEOFAFermionR FermionEOFAAction;
  typedef MobiusEOFAFermionF FermionEOFAActionF;
  typedef typename FermionAction::FermionField FermionField;
  typedef typename FermionActionF::FermionField FermionFieldF;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  // Integrator selection: exactly one HMCWrapper typedef / MD.name pair is active.
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
  //  MD.name    = std::string("Leap Frog");
  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
  MD.name    = std::string("Force Gradient");
  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
  //  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 6;
  MD.trajL   = 1.0;
  // Trajectory bookkeeping: resume from a checkpoint at trajectory 590.
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 590;
  HMCparams.Trajectories     = 1000;
  HMCparams.NoMetropolisUntil=  0;
  //  "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  // Checkpoint file prefixes, save interval and format.
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
  CPparams.saveInterval  = 10;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  // Seeds for the serial and parallel random number generators.
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection 
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  // Physics parameters.
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.04;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; 
  RealD c   = 0.0;
  // Intermediate Hasenbusch masses between light_mass and pv_mass.
  std::vector<Real> hasenbusch({ 0.1, 0.3, 0.6 });
  // Double precision grids: 4d grids come from the HMC resources, 5d grids
  // are built on the 4d grid with extent Ls.
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  // Single precision grids: same lattice and MPI layout, SIMD layout taken
  // from vComplexF. (simdD is computed but not used below.)
  std::vector<int> latt  = GridDefaultLatt();
  std::vector<int> mpi   = GridDefaultMpi();
  std::vector<int> simdF = GridDefaultSimd(Nd,vComplexF::Nsimd());
  std::vector<int> simdD = GridDefaultSimd(Nd,vComplexD::Nsimd());
  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi);
  auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(GridPtrF);
  auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtrF);
  auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtrF);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  LatticeGaugeFieldF UF(GridPtrF);
  // These lines are unnecessary if the boundary conditions are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  FermionActionF::ImplParams ParamsF(boundary);
  // Separate tolerances for the action and derivative (force) solves.
  double ActionStoppingCondition     = 1e-10;
  double DerivativeStoppingCondition = 1e-6;
  double MaxCGIterations = 30000;
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  // NOTE(review): the constructor argument is presumably the step multiplier
  // of the level relative to the outer one - confirm against ActionLevel.
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(8);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
  typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
  typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
  typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
  typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
  // DJM: setup for EOFA ratio (Mobius)
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 0.1;
  OFRp.hi       = 25.0;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-9;
  OFRp.degree   = 14;
  OFRp.precision= 50;
  // Left and right EOFA operators, each in double (U) and single (UF) precision.
  MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
  MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
  ConjugateGradient<FermionField>      ActionCG(ActionStoppingCondition,MaxCGIterations);
  ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations);
 #ifdef MIXED_PRECISION
  // MX_inner is the inner-iteration limit given to every mixed precision solver.
  const int MX_inner = 1000;
  // Mixed precision EOFA
  LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
  LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
  LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF);
  LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
  // Four solvers: action and derivative tolerances for each of L and R.
  MxPCG_EOFA ActionCGL(ActionStoppingCondition,
 		       MX_inner,
 		       MaxCGIterations,
 		       GridPtrF,
 		       FrbGridF,
 		       Strange_Op_LF,Strange_Op_L,
 		       Strange_LinOp_LF,Strange_LinOp_L);
  MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
 			   MX_inner,
 			   MaxCGIterations,
 			   GridPtrF,
 			   FrbGridF,
 			   Strange_Op_LF,Strange_Op_L,
 			   Strange_LinOp_LF,Strange_LinOp_L);
  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
 		       MX_inner,
 		       MaxCGIterations,
 		       GridPtrF,
 		       FrbGridF,
 		       Strange_Op_RF,Strange_Op_R,
 		       Strange_LinOp_RF,Strange_LinOp_R);
  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
 			   MX_inner,
 			   MaxCGIterations,
 			   GridPtrF,
 			   FrbGridF,
 			   Strange_Op_RF,Strange_Op_R,
 			   Strange_LinOp_RF,Strange_LinOp_R);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG, 
 	 ActionCGL, ActionCGR,
 	 DerivativeCGL, DerivativeCGR,
 	 OFRp, true);
 #else
  // Double precision only: the same CG objects serve both L and R.
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
    EOFA(Strange_Op_L, Strange_Op_R, 
 	 ActionCG,
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG,
 	 OFRp, true);
 #endif
  Level1.push_back(&EOFA);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  // Build the mass chain: denominators are {light_mass, hasenbusch...} and
  // numerators are {hasenbusch..., pv_mass}, so term h is num[h] / den[h].
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  //////////////////////////////////////////////////////////////
  // Forced to replicate the MxPCG and DenominatorsF etc.. because
  // there is no convenient way to "Clone" physics params from double op
  // into single op for any operator pair.
  // Same issue prevents using MxPCG in the Heatbath step
  //////////////////////////////////////////////////////////////
  // All objects below are heap allocated and never deleted in this function;
  // Level1 stores the raw pointers to the quotient actions. The MxPCG,
  // DenominatorsF and LinOp vectors are only filled when MIXED_PRECISION is
  // defined.
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  std::vector<MxPCG *> ActionMPCG;
  std::vector<MxPCG *> MPCG;
  std::vector<FermionActionF *> DenominatorsF;
  std::vector<LinearOperatorD *> LinOpD;
  std::vector<LinearOperatorF *> LinOpF; 
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
 #ifdef MIXED_PRECISION
    ////////////////////////////////////////////////////////////////////////////
    // Mixed precision CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    // Single precision twin of the denominator plus Schur operators for both.
    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsF));
    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
    LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
    // MPCG[h] uses the derivative tolerance, ActionMPCG[h] the action tolerance.
    MPCG.push_back(new MxPCG(DerivativeStoppingCondition,
 			     MX_inner,
 			     MaxCGIterations,
 			     GridPtrF,
 			     FrbGridF,
 			     *DenominatorsF[h],*Denominators[h],
 			     *LinOpF[h], *LinOpD[h]) );
    ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
 				   MX_inner,
 				   MaxCGIterations,
 				   GridPtrF,
 				   FrbGridF,
 				   *DenominatorsF[h],*Denominators[h],
 				   *LinOpF[h], *LinOpD[h]) );
    // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
    Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));
 #else
    ////////////////////////////////////////////////////////////////////////////
    // Standard CG for 2f force
    ////////////////////////////////////////////////////////////////////////////
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],DerivativeCG,ActionCG));
 #endif
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/Mobius2p1fRHMC.cc
+++ b/HMC/Mobius2p1fRHMC.cc
@@ -1,198 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./HMC/Mobius2p1fRHMC.cc
 Copyright (C) 2015-2016
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 // HMC driver using Mobius fermions with a rational (RHMC) strange quark term.
 //
 // Configures a MinimumNorm2 integrator, restarts from a checkpoint at
 // trajectory 30, and assembles the action:
 //   - Level1: the one-flavour even-odd ratio rational strange term followed
 //     by a Hasenbusch chain of two-flavour even-odd ratio terms;
 //   - Level2: the Iwasaki gauge action.
 int main(int argc, char **argv) {
  using namespace Grid;
  using namespace Grid::QCD;
  Grid_init(&argc, &argv);
  int threads = GridThread::GetThreads();
  // here make a routine to print all the relevant information on the run
  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
   // Typedefs to simplify notation
  typedef WilsonImplR FermionImplPolicy;
  typedef MobiusFermionR FermionAction;
  typedef typename FermionAction::FermionField FermionField;
  typedef Grid::XmlReader       Serialiser;
  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
  // Integrator selection: exactly one HMCWrapper typedef / MD.name pair is active.
  IntegratorParameters MD;
  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
  //  MD.name    = std::string("Leap Frog");
  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
  //  MD.name    = std::string("Force Gradient");
  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
  MD.name    = std::string("MinimumNorm2");
  MD.MDsteps = 20;
  MD.trajL   = 1.0;
  // Trajectory bookkeeping: resume from a checkpoint at trajectory 30.
  HMCparameters HMCparams;
  HMCparams.StartTrajectory  = 30;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
  //  HMCparams.StartingType     =std::string("ColdStart");
  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);
  // Grid from the command line arguments --grid and --mpi
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
  // Checkpoint file prefixes, save interval and format.
  CheckpointerParameters CPparams;
  CPparams.config_prefix = "ckpoint_EODWF_lat";
  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
  CPparams.saveInterval  = 10;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
  // Seeds for the serial and parallel random number generators.
  RNGModuleParameters RNGpar;
  RNGpar.serial_seeds = "1 2 3 4 5";
  RNGpar.parallel_seeds = "6 7 8 9 10";
  TheHMC.Resources.SetRNGSeeds(RNGpar);
  // Construct observables
  // here there is too much indirection 
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
  TheHMC.Resources.AddObservable<PlaqObs>();
  //////////////////////////////////////////////
  // Physics parameters.
  const int Ls      = 16;
  Real beta         = 2.13;
  Real light_mass   = 0.01;
  Real strange_mass = 0.04;
  Real pv_mass      = 1.0;
  RealD M5  = 1.8;
  RealD b   = 1.0; 
  RealD c   = 0.0;
  // FIXME:
  // Same in MC and MD 
  // Need to mix precision too
  // Parameters handed to the one-flavour rational action for the strange quark.
  OneFlavourRationalParams OFRp;
  OFRp.lo       = 4.0e-3;
  OFRp.hi       = 30.0;
  OFRp.MaxIter  = 10000;
  OFRp.tolerance= 1.0e-10;
  OFRp.degree   = 16;
  OFRp.precision= 50;
  // Intermediate Hasenbusch masses between light_mass and pv_mass.
  std::vector<Real> hasenbusch({ 0.1 });
  // Four-dimensional grids come from the HMC resources; the five-dimensional
  // (full and red-black) grids are built on top of the 4d grid with extent Ls.
  auto GridPtr   = TheHMC.Resources.GetCartesian();
  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
  IwasakiGaugeActionR GaugeAction(beta);
  // temporarily need a gauge field
  LatticeGaugeField U(GridPtr);
  // These lines are unnecessary if the boundary conditions are all periodic
  std::vector<Complex> boundary = {1,1,1,-1};
  FermionAction::ImplParams Params(boundary);
  // A single CG (tolerance 1e-10, at most 30000 iterations) is shared by all
  // two-flavour quotient terms below.
  double StoppingCondition = 1e-10;
  double MaxCGIterations = 30000;
  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
  // NOTE(review): the constructor argument is presumably the step multiplier
  // of the level relative to the outer one - confirm against ActionLevel.
  ActionLevel<HMCWrapper::Field> Level1(1);
  ActionLevel<HMCWrapper::Field> Level2(4);
  ////////////////////////////////////
  // Strange action
  ////////////////////////////////////
  //  FermionAction StrangeOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_mass,M5,b,c, Params);
  //  DomainWallEOFAFermionR Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5);
  //  DomainWallEOFAFermionR Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5);
  //  ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L,Strange_Op_R,CG,ofp, false);
  // Strange operator at strange_mass and its partner at pv_mass feed the
  // one-flavour rational ratio action, which is added to Level1.
  FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
  FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params);
  OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> StrangePseudoFermion(StrangePauliVillarsOp,StrangeOp,OFRp);
  Level1.push_back(&StrangePseudoFermion);
  ////////////////////////////////////
  // up down action
  ////////////////////////////////////
  // Build the mass chain: denominators are {light_mass, hasenbusch...} and
  // numerators are {hasenbusch..., pv_mass}, so term h is num[h] / den[h].
  std::vector<Real> light_den;
  std::vector<Real> light_num;
  int n_hasenbusch = hasenbusch.size();
  light_den.push_back(light_mass);
  for(int h=0;h<n_hasenbusch;h++){
    light_den.push_back(hasenbusch[h]);
    light_num.push_back(hasenbusch[h]);
  }
  light_num.push_back(pv_mass);
  // The operators and actions are heap allocated and never deleted in this
  // function; Level1 stores the raw pointers to the quotient actions.
  std::vector<FermionAction *> Numerators;
  std::vector<FermionAction *> Denominators;
  std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
  for(int h=0;h<n_hasenbusch+1;h++){
    std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl;
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
    Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
  }
  for(int h=0;h<n_hasenbusch+1;h++){
    Level1.push_back(Quotients[h]);
  }
  /////////////////////////////////////////////////////////////
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level2.push_back(&GaugeAction);
  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
  /////////////////////////////////////////////////////////////
  // HMC parameters are serialisable
  std::cout << GridLogMessage << " Running the HMC "<< std::endl;
  TheHMC.Run();  // no smearing
  Grid_finalize();
 } // main
--- a/HMC/README
+++ b/HMC/README
@@ -1,109 +0,0 @@
 ********************************************************************
 TODO: 
 ********************************************************************
 i) Got mixed precision in 2f and EOFA force and action solves.
   But need mixed precision in the heatbath solve. Best for Fermop to have a "clone" method, to
   reduce the number of solver and action objects. Needed ideally for the EOFA heatbath.
   15% perhaps
   Combine with 2x trajectory length?
 ii) Rational on EOFA HB  -- relax order
                         -- Test the approx as per David email
 Resume / roll.sh 
 ----------------------------------------------------------------
 - 16^3 Currently 10 traj per hour
 - EOFA use a different derivative solver from action solver
 - EOFA fix David's hack to the SchurRedBlack guessing
 *** Reduce precision/tolerance  in EOFA with second CG param.                          (10% speed up)
 *** Force gradient - reduced precision solve for the gradient                          (4/3x speedup)
 *** Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
    -- Store the single prec action operator.
    -- Clone the gauge field from the operator function argument.
    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
 *** Mixed precision CG into EOFA portion         
 *** Further reduce precision in forces to 10^-6 ?
 *** Overall: a 3x or so is still possible => 500s -> 160s and 20 traj per hour on 16^3.
 - Use mixed precision CG in HMC                           
 - SchurRedBlack.h: stop use of operator function; use LinearOperator or similar instead.
 - Or make an OperatorFunction for mixed precision as a wrapper
 ********************************************************************
 * Signed off 2+1f HMC with Hasenbusch and strange RHMC 16^3 x 32 DWF Ls=16 Plaquette 0.5883 ish
 * Signed off 2+1f HMC with Hasenbusch and strange EOFA 16^3 x 32 DWF Ls=16 Plaquette 0.5883 ish
 * Wilson plaquette cross checked against CPS and literature GwilsonFnone
 ********************************************************************
 ********************************************************************
 * RHMC: Timesteps & eigenranges matched from previous CPS 16^3 x 32 runs:
 ********************************************************************
 ****
 Strange (m=0.04)  has eigenspan 
 **** 
 16^3 done as 1+1+1 with separate PV's. 
 /dirac1/archive/QCDOC/host/QCDDWF/DWF/2+1f/16nt32/IWASAKI/b2.13/ls16/M1_8/ms0.04/mu0.01/rhmc_multitimescale/evol5/work
 ****
 2+1f 16^3  - [ 4e^-4, 2.42 ]    for strange
 ****
 24^3 done as 1+1+1 at strange, and single quotient https://arxiv.org/pdf/0804.0473.pdf Eq 83,
 ****
 double lambda_low =   4.0000000000000002e-04 <- strange
 double lambda_low =   1.0000000000000000e-02 <- pauli villars
 And high = 2.5
 Array bsn_mass[3] = { 
 double bsn_mass[0] =   1.0000000000000000e+00
 double bsn_mass[1] =   1.0000000000000000e+00
 double bsn_mass[2] =   1.0000000000000000e+00
 }
 Array frm_mass[3] = { 
 double frm_mass[0] =   4.0000000000000001e-02
 double frm_mass[1] =   4.0000000000000001e-02
 double frm_mass[2] =   4.0000000000000001e-02
 }
 ***
 32^3 
 /dirac1/archive/QCDOC/host/QCDDWF/DWF/2+1f/32nt64/IWASAKI/b2.25/ls16/M1_8/ms0.03/mu0.004/evol6/work
 ***
 Similar det scheme
 double lambda_low =   4.0000000000000002e-04
 double lambda_low =   1.0000000000000000e-02
 Array bsn_mass[3] = { 
 double bsn_mass[0] =   1.0000000000000000e+00
 double bsn_mass[1] =   1.0000000000000000e+00
 double bsn_mass[2] =   1.0000000000000000e+00
 }
 Array frm_mass[3] = { 
 double frm_mass[0] =   3.0000000000000002e-02
 double frm_mass[1] =   3.0000000000000002e-02
 double frm_mass[2] =   3.0000000000000002e-02
 }
 ********************************************************************
 * Grid: Power method bounds check
 ********************************************************************
 - Finding largest eigenvalue approx 25 not 2.5
 - Conventions:
 Grid MpcDagMpc based on:
   (Moo-Moe Mee^-1 Meo)^dag(Moo-Moe Mee^-1 Meo)
 - with  Moo = 5-M5 = 3.2
 - CPS use(d) Moo = 1
 - Eigenrange in Grid is 3.2^2 rescaled so factor of 10 accounted for
--- a/Hadrons/A2AMatrix.hpp
+++ b/Hadrons/A2AMatrix.hpp
@@ -1,746 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/A2AMatrix.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef A2A_Matrix_hpp_
 #define A2A_Matrix_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/TimerArray.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
 #ifdef USE_MKL
 #include "mkl.h"
 #include "mkl_cblas.h"
 #endif
 #ifndef HADRONS_A2AM_NAME 
 #define HADRONS_A2AM_NAME "a2aMatrix"
 #endif
 #ifndef HADRONS_A2AM_IO_TYPE
 #define HADRONS_A2AM_IO_TYPE ComplexF
 #endif
 #define HADRONS_A2AM_PARALLEL_IO
 BEGIN_HADRONS_NAMESPACE
 // general A2A matrix set based on Eigen tensors and Grid-allocated memory
 // Dimensions:
 //   0 - ext - external field (momentum, EM field, ...)
 //   1 - str - spin-color structure
 //   2 - t   - timeslice
 //   3 - i   - left  A2A mode index
 //   4 - j   - right A2A mode index
 // 5-index row-major tensor view (Eigen::TensorMap) over an existing buffer
 template <typename T>
 using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
 // dynamically sized dense matrix, row-major storage
 template <typename T>
 using A2AMatrix = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
 // dynamically sized dense matrix, column-major storage
 template <typename T>
 using A2AMatrixTr = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
 /******************************************************************************
 *                      Abstract class for A2A kernels                        *
 ******************************************************************************/
 // Pure-virtual interface every A2A contraction kernel implements.
 template <typename T, typename Field>
 class A2AKernel
 {
 public:
     A2AKernel() = default;
     virtual ~A2AKernel() = default;
     // Run the kernel on the `left` and `right` field arrays and store the
     // result in `m`; `time` is an output reference filled by the kernel.
     virtual void operator()(A2AMatrixSet<T> &m,
                             const Field *left,
                             const Field *right,
                             const unsigned int orthogDim,
                             double &time) = 0;
     // Cost figures reported for a blockSizei x blockSizej block.
     virtual double flops(const unsigned int blockSizei,
                          const unsigned int blockSizej) = 0;
     virtual double bytes(const unsigned int blockSizei,
                          const unsigned int blockSizej) = 0;
 };
 /******************************************************************************
 *                  Class to handle A2A matrix block HDF5 I/O                 *
 ******************************************************************************/
 // Reads and writes an (nt x ni x nj) dataset of element type T stored
 // in the file `filename_` under the group `dataname_`.
 template <typename T>
 class A2AMatrixIo
 {
 public:
     // constructors
     A2AMatrixIo() = default;
     A2AMatrixIo(std::string filename,
                 std::string dataname,
                 const unsigned int nt,
                 const unsigned int ni = 0,
                 const unsigned int nj = 0);
     // destructor
     ~A2AMatrixIo() = default;
     // access to the dataset extents and its size in bytes
     unsigned int getNi() const;
     unsigned int getNj() const;
     unsigned int getNt() const;
     size_t       getSize() const;
     // file allocation: write the metadata `d` and create the chunked dataset
     template <typename MetadataType>
     void initFile(const MetadataType &d, const unsigned int chunkSize);
     // block I/O
     // write a blockSizei x blockSizej block (all timeslices) at origin (i, j)
     void saveBlock(const T *data,
                    const unsigned int i,
                    const unsigned int j,
                    const unsigned int blockSizei,
                    const unsigned int blockSizej);
     // write the (ext, str) slice of the matrix set `m` at origin (i, j)
     void saveBlock(const A2AMatrixSet<T> &m,
                    const unsigned int ext,
                    const unsigned int str,
                    const unsigned int i,
                    const unsigned int j);
     // read the dataset timeslice by timeslice into v[t]
     template <template <class> class Vec, typename VecT>
     void load(Vec<VecT> &v, double *tRead = nullptr);
 private:
     std::string  filename_{""};
     std::string  dataname_{""};
     unsigned int nt_{0};
     unsigned int ni_{0};
     unsigned int nj_{0};
 };
 /******************************************************************************
 *                  Wrapper for A2A matrix block computation                  *
 ******************************************************************************/
 // Drives the blocked computation of an A2A matrix set with a user-supplied
 // kernel and writes every finished block to disk through A2AMatrixIo.
 template <typename T, typename Field, typename MetadataType, typename TIo = T>
 class A2AMatrixBlockComputation
 {
 private:
     // one pending write: file handler, metadata, (e, s) slice and (i, j)
     // block origin
     struct IoHelper
     {
         A2AMatrixIo<TIo> io;
         MetadataType     md;
         unsigned int     e, s, i, j;
     };
     // callbacks mapping two unsigned indices to a name or a metadata object
     using FilenameFn = std::function<std::string(const unsigned int, const unsigned int)>;
     using MetadataFn = std::function<MetadataType(const unsigned int, const unsigned int)>;
 public:
     // constructor
     A2AMatrixBlockComputation(GridBase *grid,
                               const unsigned int orthogDim,
                               const unsigned int next,
                               const unsigned int nstr,
                               const unsigned int blockSize,
                               const unsigned int cacheBlockSize,
                               TimerArray *tArray = nullptr);
     // execution
     void execute(const std::vector<Field> &left,
                  const std::vector<Field> &right,
                  A2AKernel<T, Field> &kernel,
                  const FilenameFn &ionameFn,
                  const FilenameFn &filenameFn,
                  const MetadataFn &metadataFn);
 private:
     // I/O handler
     void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
 private:
     // optional timers (may be null) and the grid the computation runs on
     TimerArray            *tArray_;
     GridBase              *grid_;
     unsigned int          orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
     // kernel output for one cache block, and staging buffer for one full block
     Vector<T>             mCache_;
     Vector<TIo>           mBuf_;
     // write tasks assigned to this rank for the current block
     std::vector<IoHelper> nodeIo_;
 };
 /******************************************************************************
 *                       A2A matrix contraction kernels                       *
 ******************************************************************************/
 // Static helpers for contracting A2A matrices: trace of a product, matrix
 // product, and the matching flop counts. With USE_MKL defined the work is
 // delegated to CBLAS, otherwise to Eigen.
 class A2AContraction
 {
 public:
     // accTrMul(acc, a, b): acc += tr(a*b)
     // The trace is accumulated one row (or column) of `a` at a time; each
     // partial sum is added to `acc` inside a critical section.
     template <typename C, typename MatLeft, typename MatRight>
     static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
     {
         if ((MatLeft::Options == Eigen::RowMajor) and
             (MatRight::Options == Eigen::ColMajor))
         {
             // row of a times column of b: both contiguous in this layout
             parallel_for (unsigned int r = 0; r < a.rows(); ++r)
             {
                 C tmp;
 #ifdef USE_MKL
                 dotuRow(tmp, r, a, b);
 #else
                 // Eigen's dot() conjugates its left operand, so conjugating
                 // first gives the plain (unconjugated) product
                 tmp = a.row(r).conjugate().dot(b.col(r));
 #endif
                 parallel_critical
                 {
                     acc += tmp;
                 }
             }
         }
         else
         {
             // any other layout combination: column of a times row of b
             parallel_for (unsigned int c = 0; c < a.cols(); ++c)
             {
                 C tmp;
 #ifdef USE_MKL 
                 dotuCol(tmp, c, a, b);
 #else
                 tmp = a.col(c).conjugate().dot(b.row(c));
 #endif
                 parallel_critical
                 {
                     acc += tmp;
                 }
             }
         }
     }
     // flop count of accTrMul: one complex multiply (6 flops) and one complex
     // add (2 flops) per entry of `a`; `b` is not needed for the count
     template <typename MatLeft, typename MatRight>
     static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
     {
         double n = a.rows()*a.cols();
         return 8.*n;
     }
     // mul(res, a, b): res = a*b
 #ifdef USE_MKL
     // double precision product through cblas_zgemm; `res` is resized when its
     // shape does not match a.rows() x b.cols()
     template <template <class, int...> class Mat, int... Opts>
     static inline void mul(Mat<ComplexD, Opts...> &res, 
                            const Mat<ComplexD, Opts...> &a, 
                            const Mat<ComplexD, Opts...> &b)
     {
         static const ComplexD one(1., 0.), zero(0., 0.);
         if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
         {
             res.resize(a.rows(), b.cols());
         }
         if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
         {
             cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                         res.data(), res.cols());
         }
         else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
         {
             cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                         res.data(), res.rows());
         }
     }
     // single precision product through cblas_cgemm; same contract as above
     template <template <class, int...> class Mat, int... Opts>
     static inline void mul(Mat<ComplexF, Opts...> &res, 
                            const Mat<ComplexF, Opts...> &a, 
                            const Mat<ComplexF, Opts...> &b)
     {
         static const ComplexF one(1., 0.), zero(0., 0.);
         if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
         {
             res.resize(a.rows(), b.cols());
         }
         if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
         {
             cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                         res.data(), res.cols());
         }
         else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
         {
             cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
                         res.data(), res.rows());
         }
     }
 #else
     // Eigen fallback
     template <typename Mat>
     static inline void mul(Mat &res, const Mat &a, const Mat &b)
     {
         res = a*b;
     }
 #endif
     // flop count of mul: the result has a.rows() x b.cols() entries, each
     // needing a.cols() complex multiplies (6 flops) and a.cols() - 1 complex
     // adds (2 flops). Equal to the former nr*nr formula when b.cols() == a.rows().
     template <typename Mat>
     static inline double mulFlops(const Mat &a, const Mat &b)
     {
         const double nr = a.rows(), nc = b.cols(), nk = a.cols();
         return nr*nc*(6.*nk + 2.*(nk - 1.));
     }
 private:
     // Pointer and stride to walk row `aRow` of `a` and column `aRow` of `b`,
     // for either storage order of each operand.
     template <typename C, typename MatLeft, typename MatRight>
     static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                     unsigned int &bInc, const unsigned int aRow, 
                                     const MatLeft &a, const MatRight &b)
     {
         if (MatLeft::Options == Eigen::RowMajor)
         {
             aPt  = a.data() + aRow*a.cols();
             aInc = 1;
         }
         else if (MatLeft::Options == Eigen::ColMajor)
         {
             aPt  = a.data() + aRow;
             aInc = a.rows();
         }
         if (MatRight::Options == Eigen::RowMajor)
         {
             bPt  = b.data() + aRow;
             bInc = b.cols();
         }
         else if (MatRight::Options == Eigen::ColMajor)
         {
             bPt  = b.data() + aRow*b.rows();
             bInc = 1;
         }
     }
 #ifdef USE_MKL
     // Pointer and stride to walk column `aCol` of `a` and row `aCol` of `b`,
     // for either storage order of each operand.
     template <typename C, typename MatLeft, typename MatRight>
     static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
                                     unsigned int &bInc, const unsigned int aCol, 
                                     const MatLeft &a, const MatRight &b)
     {
         if (MatLeft::Options == Eigen::RowMajor)
         {
             aPt  = a.data() + aCol;
             aInc = a.cols();
         }
         else if (MatLeft::Options == Eigen::ColMajor)
         {
             aPt  = a.data() + aCol*a.rows();
             aInc = 1;
         }
         if (MatRight::Options == Eigen::RowMajor)
         {
             bPt  = b.data() + aCol*b.cols();
             bInc = 1;
         }
         else if (MatRight::Options == Eigen::ColMajor)
         {
             bPt  = b.data() + aCol;
             bInc = b.rows();
         }
     }
     // res = (row aRow of a) . (column aRow of b), single precision, unconjugated
     template <typename MatLeft, typename MatRight>
     static inline void dotuRow(ComplexF &res, const unsigned int aRow,
                                const MatLeft &a, const MatRight &b)
     {
         const ComplexF *aPt, *bPt;
         unsigned int   aInc, bInc;
         makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
         cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
     }
     // res = (column aCol of a) . (row aCol of b), single precision, unconjugated
     template <typename MatLeft, typename MatRight>
     static inline void dotuCol(ComplexF &res, const unsigned int aCol,
                                const MatLeft &a, const MatRight &b)
     {
         const ComplexF *aPt, *bPt;
         unsigned int   aInc, bInc;
         makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
         cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
     }
     // res = (row aRow of a) . (column aRow of b), double precision, unconjugated
     template <typename MatLeft, typename MatRight>
     static inline void dotuRow(ComplexD &res, const unsigned int aRow,
                                const MatLeft &a, const MatRight &b)
     {
         const ComplexD *aPt, *bPt;
         unsigned int   aInc, bInc;
         makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
         cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
     }
     // res = (column aCol of a) . (row aCol of b), double precision, unconjugated
     template <typename MatLeft, typename MatRight>
     static inline void dotuCol(ComplexD &res, const unsigned int aCol,
                                const MatLeft &a, const MatRight &b)
     {
         const ComplexD *aPt, *bPt;
         unsigned int   aInc, bInc;
         makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
         cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
     }
 #endif
 };
 /******************************************************************************
 *                     A2AMatrixIo template implementation                    *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Store the file name, the dataset group name and the matrix extents;
 // no file access happens here.
 template <typename T>
 A2AMatrixIo<T>::A2AMatrixIo(std::string filename,
                             std::string dataname,
                             const unsigned int nt,
                             const unsigned int ni,
                             const unsigned int nj)
 : filename_(filename)
 , dataname_(dataname)
 , nt_(nt)
 , ni_(ni)
 , nj_(nj)
 {
 }
 // access //////////////////////////////////////////////////////////////////////
 // number of timeslices
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNt() const
 {
     return nt_;
 }
 // extent along the left mode index i
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNi() const
 {
     return ni_;
 }
 // extent along the right mode index j
 template <typename T>
 unsigned int A2AMatrixIo<T>::getNj() const
 {
     return nj_;
 }
 // Size in bytes of the full nt x ni x nj matrix.
 template <typename T>
 size_t A2AMatrixIo<T>::getSize(void) const
 {
     // widen before multiplying: the product of the three unsigned int
     // extents can exceed the unsigned int range and would silently wrap
     return static_cast<size_t>(nt_)*ni_*nj_*sizeof(T);
 }
 // file allocation /////////////////////////////////////////////////////////////
 // Create the file `filename_`, write the metadata `d` under `dataname_`, then
 // create an empty nt_ x ni_ x nj_ dataset named HADRONS_A2AM_NAME in that
 // group. The dataset is chunked as nt_ x chunkSize x chunkSize with the
 // Fletcher32 checksum filter enabled. Raises a HADRONS_ERROR when built
 // without HDF5.
 template <typename T>
 template <typename MetadataType>
 void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
 {
 #ifdef HAVE_HDF5
    std::vector<hsize_t>    dim = {static_cast<hsize_t>(nt_), 
                                   static_cast<hsize_t>(ni_), 
                                   static_cast<hsize_t>(nj_)},
                            chunk = {static_cast<hsize_t>(nt_), 
                                     static_cast<hsize_t>(chunkSize), 
                                     static_cast<hsize_t>(chunkSize)};
    H5NS::DataSpace         dataspace(dim.size(), dim.data());
    H5NS::DataSet           dataset;
    H5NS::DSetCreatPropList plist;
    // create empty file just with metadata
    // (the writer lives in its own scope so it is destroyed before the file
    // is reopened below)
    {
        Hdf5Writer writer(filename_);
        write(writer, dataname_, d);
    }
    // create the dataset
    // NOTE(review): the `false` flag presumably opens the file read-write - confirm
    Hdf5Reader reader(filename_, false);
    push(reader, dataname_);
    auto &group = reader.getGroup();
    plist.setChunk(chunk.size(), chunk.data());
    plist.setFletcher32();
    dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
 // block I/O ///////////////////////////////////////////////////////////////////
 // Write `data`, a contiguous nt_ x blockSizei x blockSizej array, into the
 // dataset HADRONS_A2AM_NAME of group `dataname_`, at offset (0, i, j).
 // The dataset must already exist (see initFile). Raises a HADRONS_ERROR when
 // built without HDF5.
 template <typename T>
 void A2AMatrixIo<T>::saveBlock(const T *data, 
                                const unsigned int i, 
                                const unsigned int j,
                                const unsigned int blockSizei,
                                const unsigned int blockSizej)
 {
 #ifdef HAVE_HDF5
    Hdf5Reader           reader(filename_, false);
    // hyperslab of the file dataspace receiving the block
    std::vector<hsize_t> count = {nt_, blockSizei, blockSizej},
                         offset = {0, static_cast<hsize_t>(i),
                                   static_cast<hsize_t>(j)},
                         stride = {1, 1, 1},
                         block  = {1, 1, 1}; 
    // the memory dataspace has the shape of the block itself
    H5NS::DataSpace      memspace(count.size(), count.data()), dataspace;
    H5NS::DataSet        dataset;
    push(reader, dataname_);
    auto &group = reader.getGroup();
    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
    dataspace   = dataset.getSpace();
    dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                              stride.data(), block.data());
    dataset.write(data, Hdf5Type<T>::type(), memspace, dataspace);
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
 // Write the (ext, str) slice of the matrix set `m` at block origin (i, j).
 // The block extents are taken from dimensions 3 and 4 of `m`.
 template <typename T>
 void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
                                const unsigned int ext, const unsigned int str,
                                const unsigned int i, const unsigned int j)
 {
    unsigned int blockSizei = m.dimension(3);
    unsigned int blockSizej = m.dimension(4);
    unsigned int nstr       = m.dimension(1);
    // element offset of the (ext, str) slice in the row-major tensor; computed
    // in size_t because the product can exceed the unsigned int range
    size_t       offset     = (static_cast<size_t>(ext)*nstr + str)
                              *nt_*blockSizei*blockSizej;
    saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
 }
 // Read the dataset HADRONS_A2AM_NAME timeslice by timeslice into `v`:
 // v[t] receives the ni_ x nj_ matrix of timeslice t cast to VecT.
 //  - v     : indexable container; must already provide an element for every
 //            t in [0, nt_)
 //  - tRead : optional; when non-null it is reset and then accumulates the
 //            usecond() time spent in the HDF5 read calls
 // If ni_ or nj_ is zero they are taken from the file, otherwise the file
 // extents must match nt_ x ni_ x nj_. Raises a HADRONS_ERROR on a size
 // mismatch or when built without HDF5.
 template <typename T>
 template <template <class> class Vec, typename VecT>
 void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
 {
 #ifdef HAVE_HDF5
    Hdf5Reader           reader(filename_);
    std::vector<hsize_t> hdim;
    H5NS::DataSet        dataset;
    H5NS::DataSpace      dataspace;
    H5NS::CompType       datatype;
    push(reader, dataname_);
    auto &group = reader.getGroup();
    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
    datatype    = dataset.getCompType();
    dataspace   = dataset.getSpace();
    hdim.resize(dataspace.getSimpleExtentNdims());
    dataspace.getSimpleExtentDims(hdim.data());
    if ((nt_*ni_*nj_ != 0) and
        ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
    {
        HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
            + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
            + std::to_string(hdim[2]) + ", expected "
            + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
            + std::to_string(nj_) + ")");
    }
    else if (ni_*nj_ == 0)
    {
        if (hdim[0] != nt_)
        {
            HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
                + std::to_string(hdim[0]) + ", expected "
                + std::to_string(nt_) + ")");
        }
        // extents not specified by the caller: adopt those of the file
        ni_ = hdim[1];
        nj_ = hdim[2];
    }
    A2AMatrix<T>         buf(ni_, nj_);
    std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
                                     static_cast<hsize_t>(nj_)},
                         stride   = {1, 1, 1},
                         block    = {1, 1, 1},
                         memCount = {static_cast<hsize_t>(ni_),
                                     static_cast<hsize_t>(nj_)};
    H5NS::DataSpace      memspace(memCount.size(), memCount.data());
    std::cout << "Loading timeslice";
    std::cout.flush();
    // tRead defaults to nullptr: only touch it when the caller supplied one
    if (tRead) *tRead = 0.;
    // timeslices are read from the last to the first; the loop counts tp1 = t + 1
    // so the unsigned index never has to go below zero
    for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
    {
        unsigned int         t      = tp1 - 1;
        std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
        if (t % 10 == 0)
        {
            std::cout << " " << t;
            std::cout.flush();
        }
        dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
                                  stride.data(), block.data());
        if (tRead) *tRead -= usecond();    
        dataset.read(buf.data(), datatype, memspace, dataspace);
        if (tRead) *tRead += usecond();
        v[t] = buf.template cast<VecT>();
    }
    std::cout << std::endl;
 #else
    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
 #endif
 }
 /******************************************************************************
 *               A2AMatrixBlockComputation template implementation            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Record the run parameters and allocate the two work buffers:
 //  - mCache_ holds the kernel output for one cacheBlockSize^2 block
 //  - mBuf_   holds one full blockSize^2 block before it is written to disk
 // nt_ is the global extent of the grid along `orthogDim`; `grid` must be
 // non-null, `tArray` may be null (timers are then disabled).
 template <typename T, typename Field, typename MetadataType, typename TIo>
 A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::A2AMatrixBlockComputation(GridBase *grid,
                             const unsigned int orthogDim,
                             const unsigned int next, 
                             const unsigned int nstr,
                             const unsigned int blockSize, 
                             const unsigned int cacheBlockSize,
                             TimerArray *tArray)
 // initialisers listed in member declaration order, which is the order in
 // which they actually run
 : tArray_(tArray), grid_(grid), orthogDim_(orthogDim)
 , nt_(grid->GlobalDimensions()[orthogDim])
 , next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
 {
    // sizes are computed in size_t: the five-factor product can exceed the
    // unsigned int range
    const size_t nSlice = static_cast<size_t>(nt_)*next_*nstr_;
    mCache_.resize(nSlice*cacheBlockSize_*cacheBlockSize_);
    mBuf_.resize(nSlice*blockSize_*blockSize_);
 }
 // Timer helpers, all no-ops when tArray_ is null. The statement macros are
 // wrapped in do { } while (0) so they behave as a single statement and cannot
 // capture an `else` that follows them.
 #define START_TIMER(name) do { if (tArray_) tArray_->startTimer(name); } while (0)
 #define STOP_TIMER(name)  do { if (tArray_) tArray_->stopTimer(name); } while (0)
 #define GET_TIMER(name)   ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
 // execution ///////////////////////////////////////////////////////////////////
 // Compute the full A2A matrix set between the `left` and `right` vectors in
 // blockSize_^2 blocks and write each block to disk as soon as it is done.
 //  - kernel     : contraction kernel, called once per cacheBlockSize_^2 chunk
 //  - ionameFn   : (ext, str) -> dataset group name inside the file
 //  - filenameFn : (ext, str) -> output file name
 //  - metadataFn : (ext, str) -> metadata written when the file is created
 template <typename T, typename Field, typename MetadataType, typename TIo>
 void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::execute(const std::vector<Field> &left, const std::vector<Field> &right,
           A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
           const FilenameFn &filenameFn, const MetadataFn &metadataFn)
 {
    //////////////////////////////////////////////////////////////////////////
    // i,j   is first  loop over blockSize_ factors
    // ii,jj is second loop over cacheBlockSize_ factors for high perf contractions
    // iii,jjj are loops within cacheBlock
    // Total index is sum of these  i+ii+iii etc...
    //////////////////////////////////////////////////////////////////////////
    int    N_i = left.size();
    int    N_j = right.size();
    double flops, bytes, t_kernel;
    double nodes = grid_->NodeCount();
    // number of blocks in each direction, rounded up
    int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
    int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);
    for(int i=0;i<N_i;i+=blockSize_)
    for(int j=0;j<N_j;j+=blockSize_)
    {
        // Get the W and V vectors for this block^2 set of terms
        int N_ii = MIN(N_i-i,blockSize_);
        int N_jj = MIN(N_j-j,blockSize_);
        // view of the staging buffer shaped for this (possibly partial) block
        A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
        LOG(Message) << "All-to-all matrix block " 
                     << j/blockSize_ + NBlock_j*i/blockSize_ + 1 
                     << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
                     << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
                     << std::endl;
        // Series of cache blocked chunks of the contractions within this block
        flops    = 0.0;
        bytes    = 0.0;
        t_kernel = 0.0;
        for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
        for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
        {
            double t;
            int N_iii = MIN(N_ii-ii,cacheBlockSize_);
            int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
            A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
            START_TIMER("kernel");
            kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t);
            STOP_TIMER("kernel");
            t_kernel += t;
            flops    += kernel.flops(N_iii, N_jjj);
            bytes    += kernel.bytes(N_iii, N_jjj);
            // copy the chunk into its place in the block (converts T to TIo)
            START_TIMER("cache copy");
            parallel_for_nest5(int e =0;e<next_;e++)
            for(int s =0;s< nstr_;s++)
            for(int t =0;t< nt_;t++)
            for(int iii=0;iii< N_iii;iii++)
            for(int jjj=0;jjj< N_jjj;jjj++)
            {
                mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
            }
            STOP_TIMER("cache copy");
        }
        // perf
        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
                     << " Gflop/s/node " << std::endl;
        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
                     << " GB/s/node "  << std::endl;
        // IO
        double       blockSize, ioTime;
        unsigned int myRank = grid_->ThisRank(), nRank  = grid_->RankCount();
        LOG(Message) << "Writing block to disk" << std::endl;
        // ioTime = timer after the writes minus timer before them
        ioTime = -GET_TIMER("IO: write block");
        START_TIMER("IO: total");
        makeFileDir(filenameFn(0, 0), grid_);
 #ifdef HADRONS_A2AM_PARALLEL_IO
        grid_->Barrier();
        // make task list for current node
        // (the (ext, str) files are dealt round-robin over the ranks)
        nodeIo_.clear();
        for(int f = myRank; f < next_*nstr_; f += nRank)
        {
            IoHelper h;
            h.i  = i;
            h.j  = j;
            h.e  = f/nstr_;
            h.s  = f % nstr_;
            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
            h.md = metadataFn(h.e, h.s);
            nodeIo_.push_back(h);
        }
        // parallel IO
        for (auto &h: nodeIo_)
        {
            saveBlock(mBlock, h);
        }
        grid_->Barrier();
 #else
        // serial IO, for testing purposes only
        for(int e = 0; e < next_; e++)
        for(int s = 0; s < nstr_; s++)
        {
            IoHelper h;
            h.i  = i;
            h.j  = j;
            h.e  = e;
            h.s  = s;
            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
            h.md = metadataFn(h.e, h.s);
            saveBlock(mBlock, h);
        }
 #endif
        STOP_TIMER("IO: total");
        // byte count evaluated in double: the integer product can exceed the
        // unsigned int range
        blockSize  = static_cast<double>(next_)*nstr_*nt_*N_ii*N_jj*sizeof(TIo);
        ioTime    += GET_TIMER("IO: write block");
        LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
                     << ioTime  << " us (" 
                     << blockSize/ioTime*1.0e6/1024/1024
                     << " MB/s)" << std::endl;
    }
 }
 // I/O handler /////////////////////////////////////////////////////////////////
 // Write the slice of `m` described by `h` through its file handler. The
 // output file is created when the block at origin (0, 0) is written.
 template <typename T, typename Field, typename MetadataType, typename TIo>
 void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
 ::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
 {
    const bool isOriginBlock = (h.i == 0) and (h.j == 0);
    if (isOriginBlock)
    {
        START_TIMER("IO: file creation");
        h.io.initFile(h.md, blockSize_);
        STOP_TIMER("IO: file creation");
    }
    START_TIMER("IO: write block");
    h.io.saveBlock(m, h.e, h.s, h.i, h.j);
    STOP_TIMER("IO: write block");
 }
 #undef START_TIMER
 #undef STOP_TIMER
 #undef GET_TIMER
 END_HADRONS_NAMESPACE
 #endif // A2A_Matrix_hpp_
--- a/Hadrons/A2AVectors.hpp
+++ b/Hadrons/A2AVectors.hpp
@@ -1,342 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/A2AVectors.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: fionnoh <fionnoh@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef A2A_Vectors_hpp_
 #define A2A_Vectors_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Environment.hpp>
 #include <Hadrons/Solver.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                 Class to generate V & W all-to-all vectors                 *
 ******************************************************************************/
 // Generator of the V and W all-to-all vectors built from a fermion action and
 // a solver. The low-mode methods take an eigenvector/eigenvalue pair, the
 // high-mode methods take a noise field. The *5D variants additionally fill a
 // second field through the action's Import/Export routines.
 template <typename FImpl>
 class A2AVectorsSchurDiagTwo
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    SOLVER_TYPE_ALIASES(FImpl,);
 public:
    // both arguments are stored by reference and must outlive this object
    A2AVectorsSchurDiagTwo(FMat &action, Solver &solver);
    virtual ~A2AVectorsSchurDiagTwo(void) = default;
    // low modes: vout/wout are outputs, (evec, eval) is the eigenpair
    void makeLowModeV(FermionField &vout, 
                      const FermionField &evec, const Real &eval);
    void makeLowModeV5D(FermionField &vout_4d, FermionField &vout_5d, 
                        const FermionField &evec, const Real &eval);
    void makeLowModeW(FermionField &wout, 
                      const FermionField &evec, const Real &eval);
    void makeLowModeW5D(FermionField &wout_4d, FermionField &wout_5d, 
                        const FermionField &evec, const Real &eval);
    // high modes: vout/wout are outputs, noise is the input source.
    // Parameter names match the definitions below; the 5D variants accept
    // noise living either on the fermion grid or on a grid with one dimension
    // less.
    void makeHighModeV(FermionField &vout, const FermionField &noise);
    void makeHighModeV5D(FermionField &vout_4d, FermionField &vout_5d, 
                         const FermionField &noise);
    void makeHighModeW(FermionField &wout, const FermionField &noise);
    void makeHighModeW5D(FermionField &wout_4d, FermionField &wout_5d, 
                         const FermionField &noise);
 private:
    FMat                                     &action_;
    Solver                                   &solver_;
    GridBase                                 *fGrid_, *frbGrid_, *gGrid_;
    // NOTE(review): is5d_ is neither initialised by the constructor nor read
    // by any method defined in this header -- confirm whether it is needed.
    bool                                     is5d_;
    // scratch fields reused across calls (methods are therefore not reentrant)
    FermionField                             src_o_, sol_e_, sol_o_, tmp_, tmp5_;
    SchurDiagTwoOperator<FMat, FermionField> op_;
 };
 /******************************************************************************
 *                  Methods for V & W all-to-all vectors I/O                  *
 ******************************************************************************/
 // Static helpers to write and read a set of all-to-all vectors, either as one
 // record file or as one file per vector inside a directory.
 class A2AVectorsIo
 {
 public:
    // record header stored with each vector: its position in the set
    struct Record: Serializable
    {
        GRID_SERIALIZABLE_CLASS_MEMBERS(Record,
                                        unsigned int, index);
        Record(void): index(0) {}
    };
 public:
    template <typename Field>
    static void write(const std::string fileStem, std::vector<Field> &vec, 
                      const bool multiFile, const int trajectory = -1);
    template <typename Field>
    static void read(std::vector<Field> &vec, const std::string fileStem,
                     const bool multiFile, const int trajectory = -1);
 private:
    // Build the output name from the stem: ".<traj>" is appended for a
    // non-negative trajectory, and ".bin" is appended in single-file mode.
    static inline std::string vecFilename(const std::string stem, const int traj, 
                                          const bool multiFile)
    {
        std::string name = stem;

        if (traj >= 0)
        {
            name += "." + std::to_string(traj);
        }
        if (!multiFile)
        {
            name += ".bin";
        }

        return name;
    }
 };
 /******************************************************************************
 *               A2AVectorsSchurDiagTwo template implementation               *
 ******************************************************************************/
 // Store the action and solver by reference, fetch the three grids from the
 // action, and construct the scratch fields and the Schur operator on them.
 // NOTE(review): the member is5d_ is not part of this initialiser list.
 template <typename FImpl>
 A2AVectorsSchurDiagTwo<FImpl>::A2AVectorsSchurDiagTwo(FMat &action, Solver &solver)
 : action_(action)
 , solver_(solver)
 , fGrid_(action_.FermionGrid())
 , frbGrid_(action_.FermionRedBlackGrid())
 , gGrid_(action_.GaugeGrid())
 // scratch fields built on frbGrid_
 , src_o_(frbGrid_)
 , sol_e_(frbGrid_)
 , sol_o_(frbGrid_)
 , tmp_(frbGrid_)
 // scratch field built on fGrid_
 , tmp5_(fGrid_)
 , op_(action_)
 {}
 // Build the low-mode V vector `vout` from the eigenpair (evec, eval).
 // The even and odd halves are computed separately in the scratch fields
 // sol_e_ / sol_o_ and then written back into vout. The asserts check the
 // checkerboard tag of every intermediate result.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeV(FermionField &vout, const FermionField &evec, const Real &eval)
 {
    // copy the eigenvector into the scratch source and tag it as odd
    src_o_ = evec;
    src_o_.checkerboard = Odd;
    pickCheckerboard(Even, sol_e_, vout);
    pickCheckerboard(Odd, sol_o_, vout);
    /////////////////////////////////////////////////////
    // v_ie = -(1/eval_i) * MeeInv Meo MooInv evec_i
    /////////////////////////////////////////////////////
    action_.MooeeInv(src_o_, tmp_);
    assert(tmp_.checkerboard == Odd);
    action_.Meooe(tmp_, sol_e_);
    assert(sol_e_.checkerboard == Even);
    // tmp_ is reused here, overwriting the MooeeInv(src_o_) result above
    action_.MooeeInv(sol_e_, tmp_);
    assert(tmp_.checkerboard == Even);
    sol_e_ = (-1.0 / eval) * tmp_;
    assert(sol_e_.checkerboard == Even);
    /////////////////////////////////////////////////////
    // v_io = (1/eval_i) * MooInv evec_i
    /////////////////////////////////////////////////////
    // recomputed because tmp_ was overwritten in the even part
    action_.MooeeInv(src_o_, tmp_);
    assert(tmp_.checkerboard == Odd);
    sol_o_ = (1.0 / eval) * tmp_;
    assert(sol_o_.checkerboard == Odd);
    // write both halves back into the output field
    setCheckerboard(vout, sol_e_);
    assert(sol_e_.checkerboard == Even);
    setCheckerboard(vout, sol_o_);
    assert(sol_o_.checkerboard == Odd);
 }
 // Compute the low-mode V vector into vout_5d with makeLowModeV, then fill
 // vout_4d from it through the action's ExportPhysicalFermionSolution.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeV5D(FermionField &vout_4d, FermionField &vout_5d, const FermionField &evec, const Real &eval)
 {
    makeLowModeV(vout_5d, evec, eval);
    action_.ExportPhysicalFermionSolution(vout_5d, vout_4d);
 }
 // Build the low-mode W vector `wout` from the eigenvector `evec`.
 // The even and odd halves are computed in the scratch fields sol_e_ / sol_o_
 // and written back into wout. Note that `eval` is not used in this body.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeW(FermionField &wout, const FermionField &evec, const Real &eval)
 {
    // copy the eigenvector into the scratch source and tag it as odd
    src_o_ = evec;
    src_o_.checkerboard = Odd;
    pickCheckerboard(Even, sol_e_, wout);
    pickCheckerboard(Odd, sol_o_, wout);
    /////////////////////////////////////////////////////
    // w_ie = - MeeInvDag MoeDag Doo evec_i
    /////////////////////////////////////////////////////
    op_.Mpc(src_o_, tmp_);
    assert(tmp_.checkerboard == Odd);
    action_.MeooeDag(tmp_, sol_e_);
    assert(sol_e_.checkerboard == Even);
    // tmp_ is reused here, overwriting the Mpc(src_o_) result above
    action_.MooeeInvDag(sol_e_, tmp_);
    assert(tmp_.checkerboard == Even);
    sol_e_ = (-1.0) * tmp_;
    /////////////////////////////////////////////////////
    // w_io = Doo evec_i
    /////////////////////////////////////////////////////
    // applied again, this time directly into the odd output half
    op_.Mpc(src_o_, sol_o_);
    assert(sol_o_.checkerboard == Odd);
    // write both halves back into the output field
    setCheckerboard(wout, sol_e_);
    assert(sol_e_.checkerboard == Even);
    setCheckerboard(wout, sol_o_);
    assert(sol_o_.checkerboard == Odd);
 }
 // Compute the low-mode W vector into the scratch field tmp5_, apply the
 // action's DminusDag to obtain wout_5d, then fill wout_4d from wout_5d
 // through ExportPhysicalFermionSource.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeW5D(FermionField &wout_4d, 
                                                    FermionField &wout_5d, 
                                                    const FermionField &evec, 
                                                    const Real &eval)
 {
    makeLowModeW(tmp5_, evec, eval);
    action_.DminusDag(tmp5_, wout_5d);
    action_.ExportPhysicalFermionSource(wout_5d, wout_4d);
 }
 // High-mode V vector: the stored solver is invoked with `vout` as its first
 // argument and the noise field as its second.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeV(FermionField &vout, 
                                                   const FermionField &noise)
 {
    solver_(vout, noise);
 }
 // High-mode V vector with both outputs. The noise is first brought into the
 // scratch field tmp5_: imported through the action when its grid has one
 // dimension less than the fermion grid, copied as-is otherwise. The solver
 // result goes to vout_5d and is then exported into vout_4d.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeV5D(FermionField &vout_4d, 
                                                     FermionField &vout_5d, 
                                                     const FermionField &noise)
 {
    const bool lowerDimNoise = 
        (noise._grid->Dimensions() == fGrid_->Dimensions() - 1);

    if (!lowerDimNoise)
    {
        tmp5_ = noise;
    }
    else
    {
        action_.ImportPhysicalFermionSource(noise, tmp5_);
    }

    makeHighModeV(vout_5d, tmp5_);
    action_.ExportPhysicalFermionSolution(vout_5d, vout_4d);
 }
 // High-mode W vector: a plain copy of the noise field.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW(FermionField &wout, 
                                                   const FermionField &noise)
 {
    wout = noise;
 }
 // High-mode W vector with both outputs. When the noise grid has one
 // dimension less than the fermion grid, the noise is copied into wout_4d and
 // imported into wout_5d; otherwise it is copied into wout_5d and wout_4d is
 // exported from it.
 template <typename FImpl>
 void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW5D(FermionField &wout_4d, 
                                                     FermionField &wout_5d, 
                                                     const FermionField &noise)
 {
    const bool lowerDimNoise = 
        (noise._grid->Dimensions() == fGrid_->Dimensions() - 1);

    if (!lowerDimNoise)
    {
        wout_5d = noise;
        action_.ExportPhysicalFermionSource(wout_5d, wout_4d);
    }
    else
    {
        action_.ImportUnphysicalFermion(noise, wout_5d);
        wout_4d = noise;
    }
 }
 /******************************************************************************
 *               all-to-all vectors I/O template implementation               *
 ******************************************************************************/
 template <typename Field>
 void A2AVectorsIo::write(const std::string fileStem, std::vector<Field> &vec, 
                         const bool multiFile, const int trajectory)
 {
    Record       record;
    GridBase     *grid = vec[0]._grid;
    ScidacWriter binWriter(grid->IsBoss());
    std::string  filename = vecFilename(fileStem, trajectory, multiFile);
    if (multiFile)
    {
        std::string fullFilename;
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
            LOG(Message) << "Writing vector " << i << std::endl;
            makeFileDir(fullFilename, grid);
            binWriter.open(fullFilename);
            record.index = i;
            binWriter.writeScidacFieldRecord(vec[i], record);
            binWriter.close();
        }
    }
    else
    {
        makeFileDir(filename, grid);
        binWriter.open(filename);
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            LOG(Message) << "Writing vector " << i << std::endl;
            record.index = i;
            binWriter.writeScidacFieldRecord(vec[i], record);
        }
        binWriter.close();
    }
 }
 template <typename Field>
 void A2AVectorsIo::read(std::vector<Field> &vec, const std::string fileStem, 
                        const bool multiFile, const int trajectory)
 {
    Record       record;
    ScidacReader binReader;
    std::string  filename = vecFilename(fileStem, trajectory, multiFile);
    if (multiFile)
    {
        std::string fullFilename;
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
            LOG(Message) << "Reading vector " << i << std::endl;
            binReader.open(fullFilename);
            binReader.readScidacFieldRecord(vec[i], record);
            binReader.close();
            if (record.index != i)
            {
                HADRONS_ERROR(Io, "vector index mismatch");
            }
        }
    }
    else
    {
        binReader.open(filename);
        for (unsigned int i = 0; i < vec.size(); ++i)
        {
            LOG(Message) << "Reading vector " << i << std::endl;
            binReader.readScidacFieldRecord(vec[i], record);
            if (record.index != i)
            {
                HADRONS_ERROR(Io, "vector index mismatch");
            }
        }
        binReader.close();
    }
 }
 END_HADRONS_NAMESPACE
 #endif // A2A_Vectors_hpp_
--- a/Hadrons/Application.cc
+++ b/Hadrons/Application.cc
@@ -1,288 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Application.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Application.hpp>
 #include <Hadrons/GeneticScheduler.hpp>
 #include <Hadrons/Modules.hpp>
 using namespace Grid;
 using namespace QCD;
 using namespace Hadrons;
 #define BIG_SEP "================"
 #define SEP     "----------------"
 /******************************************************************************
 *                       Application implementation                           *
 ******************************************************************************/
 // constructors ////////////////////////////////////////////////////////////////
 #define MACOUT(macro)    macro              << " (" << #macro << ")"
 #define MACOUTS(macro) HADRONS_STR(macro) << " (" << #macro << ")"
 // Default constructor: initialise the logger and, when a default lattice is
 // defined, compute the local volume (global dimensions divided by the MPI
 // partition) and log the dimensions and compile-time defaults.
 Application::Application(void)
 {
    initLogger();

    auto dim = GridDefaultLatt();
    auto mpi = GridDefaultMpi();
    auto loc = dim;

    // nothing to compute or report without lattice dimensions
    if (dim.size() == 0)
    {
        return;
    }

    locVol_ = 1;
    for (unsigned int d = 0; d < dim.size(); ++d)
    {
        loc[d]  = loc[d]/mpi[d];
        locVol_ = locVol_*loc[d];
    }

    LOG(Message) << "====== HADRONS APPLICATION INITIALISATION ======" << std::endl;
    LOG(Message) << "** Dimensions" << std::endl;
    LOG(Message) << "Global lattice: " << dim << std::endl;
    LOG(Message) << "MPI partition : " << mpi << std::endl;
    LOG(Message) << "Local lattice : " << loc << std::endl;
    LOG(Message) << std::endl;

    LOG(Message) << "** Default parameters (and associated C macros)" << std::endl;
    LOG(Message) << "ASCII output precision  : " << MACOUT(DEFAULT_ASCII_PREC) << std::endl;
    LOG(Message) << "Fermion implementation  : " << MACOUTS(FIMPLBASE) << std::endl;
    LOG(Message) << "z-Fermion implementation: " << MACOUTS(ZFIMPLBASE) << std::endl;
    LOG(Message) << "Scalar implementation   : " << MACOUTS(SIMPLBASE) << std::endl;
    LOG(Message) << "Gauge implementation    : " << MACOUTS(GIMPLBASE) << std::endl;
    LOG(Message) << "Eigenvector base size   : " 
                 << MACOUT(HADRONS_DEFAULT_LANCZOS_NBASIS) << std::endl;
    LOG(Message) << "Schur decomposition     : " << MACOUTS(HADRONS_DEFAULT_SCHUR) << std::endl;
    LOG(Message) << std::endl;
 }
 // Construct from global parameters: run the default constructor, then store
 // `par` through setPar().
 Application::Application(const Application::GlobalPar &par)
 : Application()
 {
    setPar(par);
 }
 // Construct from a parameter file name: run the default constructor, then
 // remember the name. The file itself is not parsed here; run() parses it
 // when no module has been registered.
 Application::Application(const std::string parameterFileName)
 : Application()
 {
    parameterFileName_ = parameterFileName;
 }
 // access //////////////////////////////////////////////////////////////////////
 // Replace the stored global parameters with a copy of `par`.
 void Application::setPar(const Application::GlobalPar &par)
 {
    par_ = par;
 }
 // Return a read-only reference to the stored global parameters.
 const Application::GlobalPar & Application::getPar(void)
 {
    return par_;
 }
 // execute /////////////////////////////////////////////////////////////////////
 // Run the application: parse the parameter file if one was given and no
 // module is registered yet, check the run id, configure the I/O retry count,
 // print the VM and environment content, obtain a schedule (computed or loaded
 // from file), optionally dump the module graph, then loop on configurations.
 void Application::run(void)
 {
    LOG(Message) << "====== HADRONS APPLICATION START ======" << std::endl;

    const bool needParsing = !parameterFileName_.empty() 
                             and (vm().getNModule() == 0);

    if (needParsing)
    {
        parseParameterFile(parameterFileName_);
    }

    // bound after parsing, which may have replaced the parameters
    const auto &par = getPar();

    if (par.runId.empty())
    {
        HADRONS_ERROR(Definition, "run id is empty");
    }
    LOG(Message) << "RUN ID '" << par.runId << "'" << std::endl;
    BinaryIO::latticeWriteMaxRetry = par.parallelWriteMaxRetry;
    LOG(Message) << "Attempt(s) for resilient parallel I/O: " 
                 << BinaryIO::latticeWriteMaxRetry << std::endl;
    vm().setRunId(par.runId);
    vm().printContent();
    env().printContent();

    // a schedule file is only loaded when it is named and saving is off
    if (!par.saveSchedule and !par.scheduleFile.empty())
    {
        loadSchedule(par.scheduleFile);
    }
    else
    {
        schedule();
        if (par.saveSchedule)
        {
            std::string filename;

            if (par.scheduleFile.empty())
            {
                filename = "hadrons.sched";
            }
            else
            {
                filename = par.scheduleFile;
            }
            saveSchedule(filename);
        }
    }
    printSchedule();

    if (!par.graphFile.empty())
    {
        makeFileDir(par.graphFile, env().getGrid());
        vm().dumpModuleGraph(par.graphFile);
    }
    configLoop();
 }
 // parse parameter file ////////////////////////////////////////////////////////
 // Serializable (name, type) pair identifying a module. It is read from and
 // written to the "id" node of each "module" entry of the parameter file.
 class ObjectId: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ObjectId,
                                    std::string, name,
                                    std::string, type);
 };
 void Application::parseParameterFile(const std::string parameterFileName)
 {
    XmlReader reader(parameterFileName);
    GlobalPar par;
    ObjectId  id;
    LOG(Message) << "Building application from '" << parameterFileName << "'..." << std::endl;
    read(reader, "parameters", par);
    setPar(par);
    if (!push(reader, "modules"))
    {
        HADRONS_ERROR(Parsing, "Cannot open node 'modules' in parameter file '" 
                              + parameterFileName + "'");
    }
    if (!push(reader, "module"))
    {
        HADRONS_ERROR(Parsing, "Cannot open node 'modules/module' in parameter file '" 
                              + parameterFileName + "'");
    }
    do
    {
        read(reader, "id", id);
        vm().createModule(id.name, id.type, reader);
    } while (reader.nextElement("module"));
    pop(reader);
    pop(reader);
 }
 void Application::saveParameterFile(const std::string parameterFileName, unsigned int prec)
 {
    LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl;
    if (env().getGrid()->IsBoss())
    {
        XmlWriter          writer(parameterFileName);
        writer.setPrecision(prec);
        ObjectId           id;
        const unsigned int nMod = vm().getNModule();
        write(writer, "parameters", getPar());
        push(writer, "modules");
        for (unsigned int i = 0; i < nMod; ++i)
        {
            push(writer, "module");
            id.name = vm().getModuleName(i);
            id.type = vm().getModule(i)->getRegisteredName();
            write(writer, "id", id);
            vm().getModule(i)->saveParameters(writer, "options");
            pop(writer);
        }
        pop(writer);
        pop(writer);
    }
 }
 // schedule computation ////////////////////////////////////////////////////////
 // Compute the execution schedule through the virtual machine, unless a
 // schedule has already been computed or loaded.
 void Application::schedule(void)
 {
    if (scheduled_ or loadedSchedule_)
    {
        return;
    }

    program_   = vm().schedule(par_.genetic);
    scheduled_ = true;
 }
 // Save the current schedule to `filename` as the list of module names, in
 // execution order, under the key "schedule". Only the process for which
 // IsBoss() is true writes. Raises a Definition error on that process when no
 // schedule is available.
 void Application::saveSchedule(const std::string filename)
 {
    LOG(Message) << "Saving current schedule to '" << filename << "'..."
                 << std::endl;
    if (env().getGrid()->IsBoss())
    {
        // check before the writer is constructed, so that a failed call never
        // opens (and possibly overwrites) the output file
        if (!scheduled_)
        {
            HADRONS_ERROR(Definition, "Computation not scheduled");
        }

        TextWriter               writer(filename);
        std::vector<std::string> program;

        // translate module addresses into module names
        program.reserve(program_.size());
        for (auto address: program_)
        {
            program.push_back(vm().getModuleName(address));
        }
        write(writer, "schedule", program);
    }
 }
 void Application::loadSchedule(const std::string filename)
 {
    TextReader               reader(filename);
    std::vector<std::string> program;
    LOG(Message) << "Loading schedule from '" << filename << "'..."
                 << std::endl;
    read(reader, "schedule", program);
    program_.clear();
    for (auto &name: program)
    {
        program_.push_back(vm().getModuleAddress(name));
    }
    loadedSchedule_ = true;
    scheduled_      = true;
 }
 // Log the memory needed by the schedule followed by the numbered list of
 // module names (numbering starts at 1). Raises a Definition error when no
 // schedule has been computed or loaded.
 void Application::printSchedule(void)
 {
    const bool hasSchedule = scheduled_ or loadedSchedule_;

    if (!hasSchedule)
    {
        HADRONS_ERROR(Definition, "Computation not scheduled");
    }

    auto peak = vm().memoryNeeded(program_);

    LOG(Message) << "Schedule (memory needed: " << sizeString(peak) << "):"
                 << std::endl;

    unsigned int position = 1;

    for (auto address: program_)
    {
        LOG(Message) << std::setw(4) << position << ": "
                     << vm().getModuleName(address) << std::endl;
        ++position;
    }
 }
 // loop on configurations //////////////////////////////////////////////////////
 void Application::configLoop(void)
 {
    auto range = par_.trajCounter;
    for (unsigned int t = range.start; t < range.end; t += range.step)
    {
        LOG(Message) << BIG_SEP << " Starting measurement for trajectory " << t
                     << " " << BIG_SEP << std::endl;
        vm().setTrajectory(t);
        vm().executeProgram(program_);
    }
    LOG(Message) << BIG_SEP << " End of measurement " << BIG_SEP << std::endl;
    env().freeAll();
 }
--- a/Hadrons/Archive/Modules/ScalarVP.cc
+++ b/Hadrons/Archive/Modules/ScalarVP.cc
@@ -1,564 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/ScalarVP.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <jch1g10@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Hadrons/Modules/MScalar/ScalarVP.hpp>
 #include <Hadrons/Modules/MScalar/Scalar.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MScalar;
 /*
 * Scalar QED vacuum polarisation up to O(alpha)
 *
 * Conserved vector 2-point function diagram notation:
 *        _______
 *       /       \
 * U_nu *         * U_mu
 *       \_______/
 *
 *                (   adj(S(a\hat{nu}|x)) U_mu(x) S(0|x+a\hat{mu}) U_nu(0)    )
 *          = 2 Re(                             -                             )
 *                ( adj(S(a\hat{nu}|x+a\hat{mu})) adj(U_mu(x)) S(0|x) U_nu(0) )
 *  
 *
 *            _______
 *           /       \
 * free = 1 *         * 1
 *           \_______/
 *
 *
 *
 *             _______
 *            /       \
 * S = iA_nu *         * iA_mu
 *            \_______/
 *
 *
 *         Delta_1
 *         ___*___
 *        /       \
 * X = 1 *         * 1
 *        \___*___/
 *         Delta_1
 *
 *          Delta_1                     Delta_1
 *          ___*___                     ___*___
 *         /       \                   /       \
 *      1 *         * iA_mu  +  iA_nu *         * 1
 *         \_______/                   \_______/
 * 4C =        _______                     _______
 *            /       \                   /       \
 *      +  1 *         * iA_mu  +  iA_nu *         * 1
 *            \___*___/                   \___*___/
 *             Delta_1                     Delta_1
 *
 *     Delta_1   Delta_1
 *          _*___*_             _______
 *         /       \           /       \
 * 2E = 1 *         * 1  +  1 *         * 1
 *         \_______/           \_*___*_/
 *                         Delta_1   Delta_1
 *
 *          Delta_2
 *          ___*___             _______
 *         /       \           /       \
 * 2T = 1 *         * 1  +  1 *         * 1
 *         \_______/           \___*___/
 *                              Delta_2
 *
 *
 *                    _______
 *                   /       \
 * srcT = -A_nu^2/2 *         * 1
 *                   \_______/
 *
 *
 *
 *            _______
 *           /       \
 * snkT = 1 *         * -A_mu^2/2
 *           \_______/
 *
 * Full VP to O(alpha) = free + q^2*(S+X+4C+2E+2T+srcT+snkT)
 */
 /******************************************************************************
 *                  TScalarVP implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forward the module name to the Module<ScalarVPPar> base class.
 TScalarVP::TScalarVP(const std::string name)
 : Module<ScalarVPPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // Derive the names of the four propagator inputs from the `scalarProp`
 // parameter (suffixes "_0", "_Q", "_Sun", "_Tad"), store them in the
 // corresponding members, and return them preceded by the EM field name.
 std::vector<std::string> TScalarVP::getInput(void)
 {
    const auto &stem = par().scalarProp;

    prop0Name_   = stem + "_0";
    propQName_   = stem + "_Q";
    propSunName_ = stem + "_Sun";
    propTadName_ = stem + "_Tad";

    return {par().emField, prop0Name_, propQName_, propSunName_, propTadName_};
 }
 // Return the names of the output objects, "<module name>_<mu>_<nu>" for every
 // pair of directions, with nu running fastest.
 std::vector<std::string> TScalarVP::getOutput(void)
 {
    const auto               nd = env().getNd();
    std::vector<std::string> out;

    for (unsigned int mu = 0; mu < nd; ++mu)
    {
        // out.push_back(getName() + "_propQ_" + std::to_string(mu));
        const std::string muName = getName() + "_" + std::to_string(mu);

        for (unsigned int nu = 0; nu < nd; ++nu)
        {
            out.push_back(muName + "_" + std::to_string(nu));
        }
    }

    return out;
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Build the names of every object used by the module and register them:
 //  - names derived from the charged propagator module (free momentum-space
 //    propagator, "_DinvSrc" source, "_fft" object);
 //  - per-direction shift phases and O(q) propagators, and the mu/nu tensor
 //    components, which are created as lattices;
 //  - one cached momentum phase per entry of `outputMom`, only when an output
 //    file is requested;
 //  - the temporary lattices used by execute().
 void TScalarVP::setup(void)
 {
    freeMomPropName_ = FREEMOMPROP(static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().mass);
    GFSrcName_ = par().scalarProp + "_DinvSrc";
    fftName_   = par().scalarProp + "_fft";

    // rebuild all name tables from scratch
    phaseName_.clear();
    muPropQName_.clear();
    vpTensorName_.clear();
    momPhaseName_.clear();
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
        muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu));

        std::vector<std::string> vpTensorName_mu;

        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
        {
            vpTensorName_mu.push_back(getName() + "_" + std::to_string(mu)
                                      + "_" + std::to_string(nu));
        }
        vpTensorName_.push_back(vpTensorName_mu);
    }
    if (!par().output.empty())
    {
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            momPhaseName_.push_back("_momentumphase_" + std::to_string(i_p));
        }
    }

    // lattices owned by this module
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        envCreateLat(ScalarField, muPropQName_[mu]);
        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
        {
            envCreateLat(ScalarField, vpTensorName_[mu][nu]);
        }
    }
    if (!par().output.empty())
    {
        // momPhaseName_ is empty when no output momentum is listed; do not
        // index it in that case
        momPhasesDone_ = !momPhaseName_.empty()
                         and env().hasCreatedObject(momPhaseName_[0]);
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            envCacheLat(ScalarField, momPhaseName_[i_p]);
        }
    }

    // temporaries used by execute()
    envTmpLat(ScalarField, "buf");
    envTmpLat(ScalarField, "result");
    envTmpLat(ScalarField, "Amu");
    envTmpLat(ScalarField, "Usnk");
    envTmpLat(ScalarField, "tmpProp");
 }
 // execution ///////////////////////////////////////////////////////////////////
 void TScalarVP::execute(void)
 {
    // CACHING ANALYTIC EXPRESSIONS
    makeCaches();
    Complex ci(0.0,1.0);
    Real    q        = static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().charge;
    auto    &prop0   = envGet(ScalarField, prop0Name_);
    auto    &propQ   = envGet(ScalarField, propQName_);
    auto    &propSun = envGet(ScalarField, propSunName_);
    auto    &propTad = envGet(ScalarField, propTadName_);
    auto    &GFSrc   = envGet(ScalarField, GFSrcName_);
    auto    &G       = envGet(ScalarField, freeMomPropName_);
    auto    &fft     = envGet(FFT, fftName_);
    phase_.clear();
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        auto &phmu = envGet(ScalarField, phaseName_[mu]);
        phase_.push_back(&phmu);
    }
    // PROPAGATORS FROM SHIFTED SOURCES
    LOG(Message) << "Computing O(q) charged scalar propagators..."
                 << std::endl;
    std::vector<ScalarField *> muPropQ;
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        auto &propmu = envGet(ScalarField, muPropQName_[mu]);
        // -G*momD1*G*F*tau_mu*Src (momD1 = F*D1*Finv)
        propmu = adj(*phase_[mu])*GFSrc;
        momD1(propmu, fft);
        propmu = -G*propmu;
        fft.FFT_all_dim(propmu, propmu, FFT::backward);
        muPropQ.push_back(&propmu);
    }
    // CONTRACTIONS
    auto        &A = envGet(EmField, par().emField);
    envGetTmp(ScalarField, buf);
    envGetTmp(ScalarField, result);
    envGetTmp(ScalarField, Amu);
    envGetTmp(ScalarField, Usnk);
    envGetTmp(ScalarField, tmpProp);
    TComplex    Anu0, Usrc;
    std::vector<int> coor0 = {0, 0, 0, 0};
    std::vector<std::vector<ScalarField *> > vpTensor;
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        std::vector<ScalarField *> vpTensor_mu;
        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
        {
            auto &vpmunu = envGet(ScalarField, vpTensorName_[mu][nu]);
            vpTensor_mu.push_back(&vpmunu);
        }
        vpTensor.push_back(vpTensor_mu);
    }
    // Prepare output data structure if necessary
    Result outputData;
    if (!par().output.empty())
    {
        outputData.projection.resize(par().outputMom.size());
        outputData.lattice_size = env().getGrid()->_fdimensions;
        outputData.mass = static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().mass;
        outputData.charge = q;
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            outputData.projection[i_p].momentum = strToVec<int>(par().outputMom[i_p]);
            outputData.projection[i_p].pi.resize(env().getNd());
            outputData.projection[i_p].pi_free.resize(env().getNd());
            outputData.projection[i_p].pi_2E.resize(env().getNd());
            outputData.projection[i_p].pi_2T.resize(env().getNd());
            outputData.projection[i_p].pi_S.resize(env().getNd());
            outputData.projection[i_p].pi_4C.resize(env().getNd());
            outputData.projection[i_p].pi_X.resize(env().getNd());
            outputData.projection[i_p].pi_srcT.resize(env().getNd());
            outputData.projection[i_p].pi_snkT.resize(env().getNd());
            for (unsigned int nu = 0; nu < env().getNd(); ++nu)
            {
                outputData.projection[i_p].pi[nu].resize(env().getNd());
                outputData.projection[i_p].pi_free[nu].resize(env().getNd());
                outputData.projection[i_p].pi_2E[nu].resize(env().getNd());
                outputData.projection[i_p].pi_2T[nu].resize(env().getNd());
                outputData.projection[i_p].pi_S[nu].resize(env().getNd());
                outputData.projection[i_p].pi_4C[nu].resize(env().getNd());
                outputData.projection[i_p].pi_X[nu].resize(env().getNd());
                outputData.projection[i_p].pi_srcT[nu].resize(env().getNd());
                outputData.projection[i_p].pi_snkT[nu].resize(env().getNd());
            }
        }
    }
    // Do contractions
    for (unsigned int nu = 0; nu < env().getNd(); ++nu)
    {
        peekSite(Anu0, peekLorentz(A, nu), coor0);
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..."
                         << std::endl;
            Amu = peekLorentz(A, mu);
            // free
            tmpProp = Cshift(prop0, nu, -1);     // S_0(0|x-a\hat{\nu})
                                                 // = S_0(a\hat{\nu}|x)
            Usrc    = Complex(1.0,0.0);
            vpContraction(result, prop0, tmpProp, Usrc, mu);
            *vpTensor[mu][nu] = result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_free[mu][nu], result,
                            i_p);
                }
            }
            tmpProp = result; // Just using tmpProp as a temporary ScalarField
                              // here (buf is modified by calls to writeVP())
            // srcT
            result = tmpProp * (-0.5)*Anu0*Anu0;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_srcT[mu][nu], result,
                            i_p);
                }
            }
            // snkT
            result = tmpProp * (-0.5)*Amu*Amu;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_snkT[mu][nu], result,
                            i_p);
                }
            }
            // S
            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
            Usrc    = ci*Anu0;
            Usnk    = ci*Amu;
            vpContraction(result, prop0, tmpProp, Usrc, Usnk, mu);
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_S[mu][nu], result,
                            i_p);
                }
            }
            // 4C
            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
            Usrc    = Complex(1.0,0.0);
            Usnk    = ci*Amu;
            vpContraction(result, propQ, tmpProp, Usrc, Usnk, mu);
            Usrc    = ci*Anu0;
            vpContraction(buf, propQ, tmpProp, Usrc, mu);
            result += buf;
            vpContraction(buf, prop0, *muPropQ[nu], Usrc, mu);
            result += buf;
            Usrc = Complex(1.0,0.0);
            Usnk = ci*Amu;
            vpContraction(buf, prop0, *muPropQ[nu], Usrc, Usnk, mu);
            result += buf;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_4C[mu][nu], result,
                            i_p);
                }
            }
            // X
            Usrc = Complex(1.0,0.0);
            vpContraction(result, propQ, *muPropQ[nu], Usrc, mu);
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_X[mu][nu], result,
                            i_p);
                }
            }
            // 2E
            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
            Usrc    = Complex(1.0,0.0);
            vpContraction(result, propSun, tmpProp, Usrc, mu);
            tmpProp = Cshift(propSun, nu, -1);     // S_\Sigma(0|x-a\hat{\nu})
                               //(Note: <S(0|x-a\hat{\nu})> = <S(a\hat{\nu}|x)>)
            vpContraction(buf, prop0, tmpProp, Usrc, mu);
            result += buf;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_2E[mu][nu], result,
                            i_p);
                }
            }
            // 2T
            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
            Usrc    = Complex(1.0,0.0);
            vpContraction(result, propTad, tmpProp, Usrc, mu);
            tmpProp = Cshift(propTad, nu, -1);     // S_T(0|x-a\hat{\nu})
            vpContraction(buf, prop0, tmpProp, Usrc, mu);
            result += buf;
            *vpTensor[mu][nu] += q*q*result;
            // Do momentum projections if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi_2T[mu][nu], result,
                            i_p);
                }
            }
            // Do momentum projections of full VP if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pi[mu][nu],
                            *vpTensor[mu][nu], i_p);
                }
            }
        }
    }
    // OUTPUT IF NECESSARY
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected HVP to '"
                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "HVP", outputData);
    }
 }
 void TScalarVP::makeCaches(void)
 {
    envGetTmp(ScalarField, buf);
    if ( (!par().output.empty()) && (!momPhasesDone_) )
    {
        LOG(Message) << "Caching phases for momentum projections..."
                     << std::endl;
        std::vector<int> &l = env().getGrid()->_fdimensions;
        Complex          ci(0.0,1.0);
        // Calculate phase factors
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            std::vector<int> mom = strToVec<int>(par().outputMom[i_p]);
            auto &momph_ip = envGet(ScalarField, momPhaseName_[i_p]);
            momph_ip = zero;
            for (unsigned int j = 0; j < env().getNd()-1; ++j)
            {
                Real twoPiL = M_PI*2./l[j];
                LatticeCoordinate(buf, j);
                buf = mom[j]*twoPiL*buf;
                momph_ip = momph_ip + buf;
            }
            momph_ip = exp(-ci*momph_ip);
            momPhase_.push_back(&momph_ip);
        }
    }
 }
 // Conserved vector two-point contraction with a field-valued factor at the
 // sink. On return vp holds
 //   2*Re[ adj(prop_nu_x) * u_snk * Cshift(prop_0_x, mu, 1) * u_src
 //         - Cshift(adj(prop_nu_x), mu, 1) * adj(u_snk) * prop_0_x * u_src ].
 //   vp        : output field, overwritten.
 //   prop_0_x  : first propagator field entering the contraction.
 //   prop_nu_x : second propagator field; it enters through its adjoint.
 //   u_src     : site-independent factor multiplying both terms.
 //   u_snk     : field factor; its adjoint is used in the subtracted term.
 //   mu        : direction of the unit Cshift applied in each term.
 void TScalarVP::vpContraction(ScalarField &vp,
                   ScalarField &prop_0_x, ScalarField &prop_nu_x,
                   TComplex u_src, ScalarField &u_snk, int mu)
 {
    // Note: this function assumes a point source is used.
    // first term: prop_0_x shifted by one site in direction mu
    vp = adj(prop_nu_x) * u_snk * Cshift(prop_0_x, mu, 1) * u_src;
    // second term: the adjoint of prop_nu_x is shifted instead, and subtracted
    vp -= Cshift(adj(prop_nu_x), mu, 1) * adj(u_snk) * prop_0_x * u_src;
    // keep twice the real part
    vp = 2.0*real(vp);
 }
 // Conserved vector two-point contraction without any sink factor. On return
 // vp holds
 //   2*Re[ adj(prop_nu_x) * Cshift(prop_0_x, mu, 1) * u_src
 //         - Cshift(adj(prop_nu_x), mu, 1) * prop_0_x * u_src ].
 //   vp        : output field, overwritten.
 //   prop_0_x  : first propagator field entering the contraction.
 //   prop_nu_x : second propagator field; it enters through its adjoint.
 //   u_src     : site-independent factor multiplying both terms.
 //   mu        : direction of the unit Cshift applied in each term.
 void TScalarVP::vpContraction(ScalarField &vp,
                   ScalarField &prop_0_x, ScalarField &prop_nu_x,
                   TComplex u_src, int mu)
 {
    // Note: this function assumes a point source is used.
    // first term: prop_0_x shifted by one site in direction mu
    vp = adj(prop_nu_x) * Cshift(prop_0_x, mu, 1) * u_src;
    // second term: the adjoint of prop_nu_x is shifted instead, and subtracted
    vp -= Cshift(adj(prop_nu_x), mu, 1) * prop_0_x * u_src;
    // keep twice the real part
    vp = 2.0*real(vp);
 }
 void TScalarVP::project(std::vector<Complex> &projection, const ScalarField &vp, int i_p)
 {
    std::vector<TComplex>   vecBuf;
    envGetTmp(ScalarField, buf);
    buf = vp*(*momPhase_[i_p]);
    sliceSum(buf, vecBuf, Tp);
    projection.resize(vecBuf.size());
    for (unsigned int t = 0; t < vecBuf.size(); ++t)
    {
        projection[t] = TensorRemove(vecBuf[t]);
    }
 }
 // Momentum-space Delta_1 insertion: s is replaced by the accumulated result.
 // Two passes over the Nd directions are made, using the EM field components
 // A_dir and the cached phase fields phase_[dir]:
 //   pass 1: result -= i * FFT_fwd( A_dir * FFT_bwd( phase_[dir] * s ) )
 //   pass 2: result += i * adj(phase_[dir]) * FFT_fwd( A_dir * FFT_bwd(s) )
 // Note that s itself is transformed backward in place between the passes.
 void TScalarVP::momD1(ScalarField &s, FFT &fft)
 {
    auto        &A = envGet(EmField, par().emField);
    Complex     iUnit(0.0,1.0);
    envGetTmp(ScalarField, buf);
    envGetTmp(ScalarField, result);
    envGetTmp(ScalarField, Amu);
    result = zero;
    // pass 1: phase applied before the transforms
    for (unsigned int dir = 0; dir < env().getNd(); ++dir)
    {
        Amu = peekLorentz(A, dir);
        buf = (*phase_[dir])*s;
        fft.FFT_all_dim(buf, buf, FFT::backward);
        buf = Amu*buf;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result - iUnit*buf;
    }
    // s is now needed after a backward transform
    fft.FFT_all_dim(s, s, FFT::backward);
    // pass 2: conjugate phase applied after the transforms
    for (unsigned int dir = 0; dir < env().getNd(); ++dir)
    {
        Amu = peekLorentz(A, dir);
        buf = Amu*s;
        fft.FFT_all_dim(buf, buf, FFT::forward);
        result = result + iUnit*adj(*phase_[dir])*buf;
    }
    s = result;
 }
--- a/Hadrons/Archive/Modules/ScalarVP.hpp
+++ b/Hadrons/Archive/Modules/ScalarVP.hpp
@@ -1,129 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/ScalarVP.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <jch1g10@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MScalar_ScalarVP_hpp_
 #define Hadrons_MScalar_ScalarVP_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         Scalar vacuum polarisation                         *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalar)
 // Serializable parameter set of the ScalarVP module.
 //   emField    : name of the EM field object fetched with envGet(EmField, ...).
 //   scalarProp : name of the module looked up through vm().getModule() to
 //                obtain the scalar propagator parameters (e.g. its mass).
 //   output     : result file stem; an empty string disables the momentum
 //                projections and the final saveResult() call.
 //   outputMom  : momenta to project on, each a string parsed with
 //                strToVec<int>().
 class ScalarVPPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarVPPar,
                                    std::string, emField,
                                    std::string, scalarProp,
                                    std::string, output,
                                    std::vector<std::string>, outputMom);
 };
 // Module computing the scalar vacuum polarisation tensor and, optionally,
 // its momentum projections.
 class TScalarVP: public Module<ScalarVPPar>
 {
 public:
    BASIC_TYPE_ALIASES(SIMPL,);
    // EM field types taken from the photon implementation
    typedef PhotonR::GaugeField     EmField;
    typedef PhotonR::GaugeLinkField EmComp;
    // Serializable container for the momentum-projected output.
    class Result: Serializable
    {
    public:
        // Projections for a single momentum. Every pi* member is indexed as
        // [mu][nu][slice]; the suffix names the contribution it stores
        // (free, 2E, 2T, S, 4C, X, srcT, snkT), pi being the full tensor.
        class Projection: Serializable
        {
        public:
            GRID_SERIALIZABLE_CLASS_MEMBERS(Projection,
                                            std::vector<int>,     momentum,
                                            std::vector<std::vector<std::vector<Complex>>>, pi,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_free,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_2E,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_2T,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_S,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_4C,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_X,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_srcT,
                                            std::vector<std::vector<std::vector<Complex>>>, pi_snkT);
        };
        // lattice dimensions, scalar mass, charge and one Projection per
        // requested output momentum
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        std::vector<int>,        lattice_size,
                                        double,                  mass,
                                        double,                  charge,
                                        std::vector<Projection>, projection);
    };
 public:
    // constructor
    TScalarVP(const std::string name);
    // destructor
    virtual ~TScalarVP(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 private:
    // fill the cached momentum-projection phases (momPhase_) when output is
    // requested and momPhasesDone_ is not set
    void makeCaches(void);
    // conserved vector two-point contraction
    void vpContraction(ScalarField &vp,
                       ScalarField &prop_0_x, ScalarField &prop_nu_x,
                       TComplex u_src, ScalarField &u_snk, int mu);
    // conserved vector two-point contraction with unit gauge link at sink
    void vpContraction(ScalarField &vp,
                       ScalarField &prop_0_x, ScalarField &prop_nu_x,
                       TComplex u_src, int mu);
    // momentum-project vp with the i_p-th cached phase and store the
    // slice-summed values in projection (nothing is written to file here)
    void project(std::vector<Complex> &projection, const ScalarField &vp,
                 int i_p);
    // momentum-space Delta_1 insertion
    void momD1(ScalarField &s, FFT &fft);
 private:
    // true when the momentum-projection phases need not be recomputed
    bool                                        momPhasesDone_;
    // names of the environment objects used by the module
    std::string                                 freeMomPropName_, GFSrcName_,
                                                prop0Name_, propQName_,
                                                propSunName_, propTadName_,
                                                fftName_;
    std::vector<std::string>                    phaseName_, muPropQName_,
                                                momPhaseName_;
    // vpTensorName_[mu][nu] names the (mu, nu) component of the VP tensor
    std::vector<std::vector<std::string> >      vpTensorName_;
    // non-owning pointers to cached phase fields
    std::vector<ScalarField *>                  phase_, momPhase_;
 };
 MODULE_REGISTER(ScalarVP, TScalarVP, MScalar);
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MScalar_ScalarVP_hpp_
--- a/Hadrons/Archive/Modules/TestSeqConserved.cc
+++ b/Hadrons/Archive/Modules/TestSeqConserved.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/TestSeqConserved.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MUtilities;
 template class Grid::Hadrons::MUtilities::TTestSeqConserved<FIMPL>;
--- a/Hadrons/Archive/Modules/TestSeqConserved.hpp
+++ b/Hadrons/Archive/Modules/TestSeqConserved.hpp
@@ -1,186 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/TestSeqConserved.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MUtilities_TestSeqConserved_hpp_
 #define Hadrons_MUtilities_TestSeqConserved_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /*
  Ward Identity contractions using sequential propagators.
 -----------------------------
 * options:
 - q:      point source propagator, 5D if available (string)
 - qSeq:   result of sequential insertion of conserved current using q (string)
 - action: action used for computation of q (string)
 - origin: string giving point source origin of q (string)
 - t_J:    time at which sequential current is inserted (int)
 - mu:     Lorentz index of current inserted (int)
 - curr:   current type, e.g. vector/axial (Current)
 */
 /******************************************************************************
 *                            TestSeqConserved                                *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MUtilities)
 // Serializable parameter set of the TestSeqConserved module.
 //   q, qSeq : names of the propagator objects fetched as PropagatorField.
 //   action  : name of the fermion action object fetched as FMat.
 //   origin  : site coordinates as a string, parsed with strToVec<int>() and
 //             used as the site at which qSeq is read.
 //   t_J     : index of the slice picked from the sliceSum() result.
 //   mu      : index into Gamma::gmu, also forwarded to
 //             ContractConservedCurrent().
 //   curr    : current type forwarded to ContractConservedCurrent();
 //             Current::Axial selects gamma_5 in the traces, anything else
 //             the identity.
 class TestSeqConservedPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(TestSeqConservedPar,
                                    std::string,  q,
                                    std::string,  qSeq,
                                    std::string,  action,
                                    std::string,  origin,
                                    unsigned int, t_J,
                                    unsigned int, mu,
                                    Current,      curr);
 };
 // Test module comparing a sequential conserved-current insertion with the
 // conserved current contracted at the sink; the comparison is only logged.
 // FImpl selects the fermion implementation whose type aliases are imported
 // by FERM_TYPE_ALIASES.
 template <typename FImpl>
 class TTestSeqConserved: public Module<TestSeqConservedPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TTestSeqConserved(const std::string name);
    // destructor
    virtual ~TTestSeqConserved(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup: checks Ls consistency and registers the temporaries "tmp" and "c"
    virtual void setup(void);
    // execution: evaluates and logs the consistency check
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(TestSeqConserved, TTestSeqConserved<FIMPL>, MUtilities);
 /******************************************************************************
 *                     TTestSeqConserved implementation                       *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Hand the module name over to the Module<TestSeqConservedPar> base class;
 // no other state is initialised here.
 template <typename FImpl>
 TTestSeqConserved<FImpl>::TTestSeqConserved(const std::string name)
     : Module<TestSeqConservedPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Objects this module depends on: both propagators and the action.
 template <typename FImpl>
 std::vector<std::string> TTestSeqConserved<FImpl>::getInput(void)
 {
    return {par().q, par().qSeq, par().action};
 }
 // The only declared product is named after the module itself.
 template <typename FImpl>
 std::vector<std::string> TTestSeqConserved<FImpl>::getOutput(void)
 {
    return {getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Verify that propagator and action were created with the same Ls, then
 // register the two lattice temporaries used by execute().
 template <typename FImpl>
 void TTestSeqConserved<FImpl>::setup(void)
 {
    const auto propLs   = env().getObjectLs(par().q);
    const auto actionLs = env().getObjectLs(par().action);
    // a mismatch means q cannot have been computed with this action
    if (propLs != actionLs)
    {
        HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
    }
    envTmpLat(PropagatorField, "tmp");
    envTmpLat(LatticeComplex, "c");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Consistency test for the sequential conserved-current insertion.
 // Two pairs of numbers are formed and logged together with their differences:
 //   test_S/test_V   : traces of qSeq read at the site par().origin, with the
 //                     gamma structure g and with g*Gamma::gmu[par().mu];
 //   check_S/check_V : the same traces applied to the field produced by
 //                     ContractConservedCurrent(q, q, ...), summed with
 //                     sliceSum along Tp and read at slice par().t_J.
 // The function only logs; it neither returns nor asserts anything.
 // NOTE(review): par().t_J and par().mu are used as indices without a range
 // check; callers are expected to provide valid values.
 template <typename FImpl>
 void TTestSeqConserved<FImpl>::execute(void)
 {
    // Check sequential insertion of current gives same result as conserved
    // current sink upon contraction. Assume q uses a point source.
    auto                  &q    = envGet(PropagatorField, par().q);
    auto                  &qSeq = envGet(PropagatorField, par().qSeq);
    auto                  &act  = envGet(FMat, par().action);
    // gamma_5 for an axial current, identity otherwise
    Gamma::Algebra        gA = (par().curr == Current::Axial) ?
                                  Gamma::Algebra::Gamma5 :
                                  Gamma::Algebra::Identity;
    Gamma                 g(gA);
    SitePropagator        qSite;
    Complex               test_S, test_V, check_S, check_V;
    std::vector<TComplex> check_buf;
    std::vector<int>      siteCoord;
    envGetTmp(PropagatorField, tmp);
    envGetTmp(LatticeComplex, c);
    // sequential propagator evaluated at the source origin
    siteCoord = strToVec<int>(par().origin);
    peekSite(qSite, qSeq, siteCoord);
    test_S = trace(qSite*g);
    test_V = trace(qSite*g*Gamma::gmu[par().mu]);
    // reference: conserved current contracted at the sink, read at slice t_J
    act.ContractConservedCurrent(q, q, tmp, par().curr, par().mu);
    c = trace(tmp*g);
    sliceSum(c, check_buf, Tp);
    check_S = TensorRemove(check_buf[par().t_J]);
    c = trace(tmp*g*Gamma::gmu[par().mu]);
    sliceSum(c, check_buf, Tp);
    check_V = TensorRemove(check_buf[par().t_J]);
    LOG(Message) << "Test S  = " << abs(test_S)  << std::endl;
    LOG(Message) << "Test V  = " << abs(test_V)  << std::endl;
    LOG(Message) << "Check S = " << abs(check_S) << std::endl;
    LOG(Message) << "Check V = " << abs(check_V) << std::endl;
    // Check difference = 0
    check_S -= test_S;
    check_V -= test_V;
    LOG(Message) << "Consistency check for sequential conserved "
                 << par().curr << " current insertion: " << std::endl;
    LOG(Message) << "Diff S  = " << abs(check_S) << std::endl;
    LOG(Message) << "Diff V  = " << abs(check_V) << std::endl;
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MUtilities_TestSeqConserved_hpp_
--- a/Hadrons/Archive/Modules/TestSeqGamma.cc
+++ b/Hadrons/Archive/Modules/TestSeqGamma.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/TestSeqGamma.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MUtilities;
 template class Grid::Hadrons::MUtilities::TTestSeqGamma<FIMPL>;
--- a/Hadrons/Archive/Modules/TestSeqGamma.hpp
+++ b/Hadrons/Archive/Modules/TestSeqGamma.hpp
@@ -1,150 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/TestSeqGamma.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MUtilities_TestSeqGamma_hpp_
 #define Hadrons_MUtilities_TestSeqGamma_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                              TestSeqGamma                                  *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MUtilities)
 // Serializable parameter set of the TestSeqGamma module.
 //   q, qSeq : names of the propagator objects fetched as PropagatorField.
 //   origin  : site coordinates as a string, parsed with strToVec<int>() and
 //             used as the site at which qSeq is read.
 //   gamma   : gamma matrix used in both sides of the comparison.
 //   t_g     : index of the slice picked from the sliceSum() result.
 class TestSeqGammaPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(TestSeqGammaPar,
                                    std::string,    q,
                                    std::string,    qSeq,
                                    std::string,    origin,
                                    Gamma::Algebra, gamma,
                                    unsigned int,   t_g);
 };
 // Test module comparing a sequential gamma-matrix insertion with the same
 // gamma inserted at the sink; the comparison is only logged.
 // FImpl selects the fermion implementation whose type aliases are imported
 // by FERM_TYPE_ALIASES.
 template <typename FImpl>
 class TTestSeqGamma: public Module<TestSeqGammaPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TTestSeqGamma(const std::string name);
    // destructor
    virtual ~TTestSeqGamma(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup: registers the lattice temporary "c"
    virtual void setup(void);
    // execution: evaluates and logs the consistency check
    virtual void execute(void);
 };
 MODULE_REGISTER_TMP(TestSeqGamma, TTestSeqGamma<FIMPL>, MUtilities);
 /******************************************************************************
 *                      TTestSeqGamma implementation                          *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Hand the module name over to the Module<TestSeqGammaPar> base class;
 // no other state is initialised here.
 template <typename FImpl>
 TTestSeqGamma<FImpl>::TTestSeqGamma(const std::string name)
     : Module<TestSeqGammaPar>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Objects this module depends on: the two propagators being compared.
 template <typename FImpl>
 std::vector<std::string> TTestSeqGamma<FImpl>::getInput(void)
 {
    return {par().q, par().qSeq};
 }
 // The only declared product is named after the module itself.
 template <typename FImpl>
 std::vector<std::string> TTestSeqGamma<FImpl>::getOutput(void)
 {
    return {getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Register the complex lattice temporary "c" that execute() retrieves with
 // envGetTmp(LatticeComplex, c).
 template <typename FImpl>
 void TTestSeqGamma<FImpl>::setup(void)
 {
    envTmpLat(LatticeComplex, "c");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Consistency test for the sequential gamma insertion: the trace of g*qSeq
 // read at par().origin is compared with the slice-summed trace of
 // adj(g)*g5*adj(q)*g5*g*q read at slice par().t_g. Both values and the
 // modulus of their difference are logged; nothing is returned or asserted.
 // q is assumed to come from a point source.
 template <typename FImpl>
 void TTestSeqGamma<FImpl>::execute(void)
 {
    auto                  &q    = envGet(PropagatorField, par().q);
    auto                  &qSeq = envGet(PropagatorField, par().qSeq);
    envGetTmp(LatticeComplex, c);
    Gamma                 g5(Gamma::Algebra::Gamma5);
    Gamma                 g(par().gamma);
    std::vector<int>      origin = strToVec<int>(par().origin);
    std::vector<TComplex> sliced;
    SitePropagator        seqAtOrigin;
    Complex               seqValue, refValue;
    // sequential side: qSeq at the source origin, traced against g
    peekSite(seqAtOrigin, qSeq, origin);
    seqValue = trace(g*seqAtOrigin);
    // reference side: gamma inserted at the sink, summed per slice
    c = trace(adj(g)*g5*adj(q)*g5*g*q);
    sliceSum(c, sliced, Tp);
    refValue = TensorRemove(sliced[par().t_g]);
    LOG(Message) << "Seq Result = " << abs(seqValue) << std::endl;
    LOG(Message) << "Reference  = " << abs(refValue) << std::endl;
    // the difference should vanish
    refValue -= seqValue;
    LOG(Message) << "Consistency check for sequential " << par().gamma
                 << " insertion = " << abs(refValue) << std::endl;
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MUtilities_TestSeqGamma_hpp_
--- a/Hadrons/Archive/Modules/VPCounterTerms.cc
+++ b/Hadrons/Archive/Modules/VPCounterTerms.cc
@@ -1,260 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/VPCounterTerms.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <jch1g10@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MScalar/VPCounterTerms.hpp>
 #include <Hadrons/Modules/MScalar/Scalar.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MScalar;
 /******************************************************************************
 *                  TVPCounterTerms implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<VPCounterTermsPar> base; no other
 // state is initialised here.
 TVPCounterTerms::TVPCounterTerms(const std::string name)
 : Module<VPCounterTermsPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // The only dependency is the object named by the source parameter.
 std::vector<std::string> TVPCounterTerms::getInput(void)
 {
    return {par().source};
 }
 // This module declares no output objects, so the returned list is empty.
 std::vector<std::string> TVPCounterTerms::getOutput(void)
 {
    return {};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Builds the names of all environment objects used by this module and
 // registers the corresponding lattices: created fields, cached momentum
 // phases (only when an output file is requested) and temporaries.
 void TVPCounterTerms::setup(void)
 {
    freeMomPropName_ = FREEMOMPROP(par().mass);
    phaseName_.clear();
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
    }
    GFSrcName_ = getName() + "_DinvSrc";
    phatsqName_ = getName() + "_pHatSquared";
    prop0Name_ = getName() + "_freeProp";
    twoscalarName_ = getName() + "_2scalarProp";
    psquaredName_ = getName() + "_psquaredProp";
    // Reset like phaseName_ above, so that repeated setup() calls do not keep
    // appending duplicate names to the list.
    momPhaseName_.clear();
    if (!par().output.empty())
    {
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            momPhaseName_.push_back("_momentumphase_" + std::to_string(i_p));
        }
    }
    envCreateLat(ScalarField, freeMomPropName_);
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        envCreateLat(ScalarField, phaseName_[mu]);
    }
    envCreateLat(ScalarField, phatsqName_);
    envCreateLat(ScalarField, GFSrcName_);
    envCreateLat(ScalarField, prop0Name_);
    envCreateLat(ScalarField, twoscalarName_);
    envCreateLat(ScalarField, psquaredName_);
    if (!par().output.empty())
    {
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            envCacheLat(ScalarField, momPhaseName_[i_p]);
        }
    }
    envTmpLat(ScalarField, "buf");
    envTmpLat(ScalarField, "tmp_vp");
    envTmpLat(ScalarField, "vpPhase");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Computes the free scalar propagator and the two counter-term propagators
 // from the source field, contracts them into the two-scalar and three-scalar
 // loops for every (mu, nu) pair and, when par().output is non-empty, saves
 // the momentum-projected correlators for each entry of par().outputMom.
 void TVPCounterTerms::execute(void)
 {
    auto &source = envGet(ScalarField, par().source);
    Complex     ci(0.0,1.0);
    FFT         fft(env().getGrid());
    envGetTmp(ScalarField, buf);
    envGetTmp(ScalarField, tmp_vp);
    // Drop the lattice pointers stored by a previous call: the vectors are
    // refilled below, and project() indexes momPhase_ from 0, so stale entries
    // would otherwise be used instead of the current ones.
    phase_.clear();
    momPhase_.clear();
    // Momentum-space free scalar propagator
    auto &G = envGet(ScalarField, freeMomPropName_);
    SIMPL::MomentumSpacePropagator(G, par().mass);
    // Phases and hat{p}^2
    auto &phatsq = envGet(ScalarField, phatsqName_);
    std::vector<int> &l = env().getGrid()->_fdimensions;
    LOG(Message) << "Calculating shift phases..." << std::endl;
    phatsq = zero;
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
    {
        Real    twoPiL = M_PI*2./l[mu];
        auto &phmu  = envGet(ScalarField, phaseName_[mu]);
        LatticeCoordinate(buf, mu);
        phmu = exp(ci*twoPiL*buf);
        phase_.push_back(&phmu);
        buf = 2.*sin(.5*twoPiL*buf);
        phatsq = phatsq + buf*buf;
    }
    // G*F*src
    auto &GFSrc       = envGet(ScalarField, GFSrcName_);
    fft.FFT_all_dim(GFSrc, source, FFT::forward);
    GFSrc = G*GFSrc;
    // Position-space free scalar propagator
    auto &prop0       = envGet(ScalarField, prop0Name_);
    prop0 = GFSrc;
    fft.FFT_all_dim(prop0, prop0, FFT::backward);
    // Propagators for counter-terms
    auto &twoscalarProp        = envGet(ScalarField, twoscalarName_);
    auto &psquaredProp         = envGet(ScalarField, psquaredName_);
    twoscalarProp = G*GFSrc;
    fft.FFT_all_dim(twoscalarProp, twoscalarProp, FFT::backward);
    psquaredProp = G*phatsq*GFSrc;
    fft.FFT_all_dim(psquaredProp, psquaredProp, FFT::backward);
    // Prepare output data structure if necessary
    Result outputData;
    if (!par().output.empty())
    {
        outputData.projection.resize(par().outputMom.size());
        outputData.lattice_size = env().getGrid()->_fdimensions;
        outputData.mass = par().mass;
        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
        {
            outputData.projection[i_p].momentum = strToVec<int>(par().outputMom[i_p]);
            outputData.projection[i_p].twoScalar.resize(env().getNd());
            outputData.projection[i_p].threeScalar.resize(env().getNd());
            outputData.projection[i_p].pSquaredInsertion.resize(env().getNd());
            for (unsigned int nu = 0; nu < env().getNd(); ++nu)
            {
                outputData.projection[i_p].twoScalar[nu].resize(env().getNd());
                outputData.projection[i_p].threeScalar[nu].resize(env().getNd());
                outputData.projection[i_p].pSquaredInsertion[nu].resize(env().getNd());
            }
            // Calculate phase factors (the last direction is left out of the
            // momentum phase)
            auto &momph_ip = envGet(ScalarField, momPhaseName_[i_p]);
            momph_ip = zero;
            for (unsigned int j = 0; j < env().getNd()-1; ++j)
            {
                Real twoPiL = M_PI*2./l[j];
                LatticeCoordinate(buf, j);
                buf = outputData.projection[i_p].momentum[j]*twoPiL*buf;
                momph_ip = momph_ip + buf;
            }
            momph_ip = exp(-ci*momph_ip);
            momPhase_.push_back(&momph_ip);
        }
    }
    // Contractions
    for (unsigned int nu = 0; nu < env().getNd(); ++nu)
    {
        buf = adj(Cshift(prop0, nu, -1));
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
        {
            // Two-scalar loop
            tmp_vp = buf * Cshift(prop0, mu, 1);
            tmp_vp -= Cshift(buf, mu, 1) * prop0;
            tmp_vp = 2.0*real(tmp_vp);
            // Output if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].twoScalar[mu][nu],
                            tmp_vp, i_p);
                }
            }
            // Three-scalar loop (no vertex)
            tmp_vp = buf * Cshift(twoscalarProp, mu, 1);
            tmp_vp -= Cshift(buf, mu, 1) * twoscalarProp;
            tmp_vp = 2.0*real(tmp_vp);
            // Output if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].threeScalar[mu][nu],
                            tmp_vp, i_p);
                }
            }
            // Three-scalar loop (hat{p}^2 insertion)
            tmp_vp = buf * Cshift(psquaredProp, mu, 1);
            tmp_vp -= Cshift(buf, mu, 1) * psquaredProp;
            tmp_vp = 2.0*real(tmp_vp);
            // Output if necessary
            if (!par().output.empty())
            {
                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
                {
                    project(outputData.projection[i_p].pSquaredInsertion[mu][nu],
                            tmp_vp, i_p);
                }
            }
        }
    }
    // OUTPUT IF NECESSARY
    if (!par().output.empty())
    {
        LOG(Message) << "Saving momentum-projected correlators to '"
                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
                     << std::endl;
        saveResult(par().output, "scalar_loops", outputData);
    }
 }
 void TVPCounterTerms::project(std::vector<Complex> &projection, const ScalarField &vp, int i_p)
 {
    std::vector<TComplex>   vecBuf;
    envGetTmp(ScalarField, vpPhase);
    vpPhase = vp*(*momPhase_[i_p]);
    sliceSum(vpPhase, vecBuf, Tp);
    projection.resize(vecBuf.size());
    for (unsigned int t = 0; t < vecBuf.size(); ++t)
    {
        projection[t] = TensorRemove(vecBuf[t]);
    }
 }
--- a/Hadrons/Archive/Modules/VPCounterTerms.hpp
+++ b/Hadrons/Archive/Modules/VPCounterTerms.hpp
@@ -1,103 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/VPCounterTerms.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: James Harrison <jch1g10@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MScalar_VPCounterTerms_hpp_
 #define Hadrons_MScalar_VPCounterTerms_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         VPCounterTerms                                 *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MScalar)
 // Serializable parameter set of the VPCounterTerms module:
 //   source    - name of the source field (the module's only input)
 //   mass      - mass used for the free momentum-space propagator
 //   output    - result file stem; an empty string disables projection/output
 //   outputMom - momenta to project on, one string per momentum
 class VPCounterTermsPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(VPCounterTermsPar,
                                    std::string, source,
                                    double,      mass,
                                    std::string, output,
                                    std::vector<std::string>, outputMom);
 };
 // Module computing scalar-loop counter-term correlators from a source field
 // and, optionally, saving their momentum projections.
 class TVPCounterTerms: public Module<VPCounterTermsPar>
 {
 public:
    BASIC_TYPE_ALIASES(SIMPL,);
    // Serialised result written by execute() when an output file is requested.
    class Result: Serializable
    {
    public:
        // Correlators for one output momentum; the three nested vectors are
        // indexed as [mu][nu][slice] by execute()/project().
        class Projection: Serializable
        {
        public:
            GRID_SERIALIZABLE_CLASS_MEMBERS(Projection,
                                            std::vector<int>,     momentum,
                                            std::vector<std::vector<std::vector<Complex>>>, twoScalar,
                                            std::vector<std::vector<std::vector<Complex>>>, threeScalar,
                                            std::vector<std::vector<std::vector<Complex>>>, pSquaredInsertion);
        };
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        std::vector<int>,        lattice_size,
                                        double,                  mass,
                                        std::vector<Projection>, projection);
    };
 public:
    // constructor
    TVPCounterTerms(const std::string name);
    // destructor
    virtual ~TVPCounterTerms(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 private:
    // momentum-project vp with phase number i_p and store one value per slice
    void project(std::vector<Complex> &projection, const ScalarField &vp, int i_p);
 private:
    // names of the environment objects registered in setup()
    // NOTE(review): twoscalarVertexName_ and psquaredVertexName_ are not used
    // by the implementation in VPCounterTerms.cc - confirm they can be dropped.
    std::string                freeMomPropName_, GFSrcName_, phatsqName_, prop0Name_,
                               twoscalarName_, twoscalarVertexName_,
                               psquaredName_, psquaredVertexName_;
    std::vector<std::string>   phaseName_, momPhaseName_;
    // pointers to the phase lattices, filled by execute()
    std::vector<ScalarField *> phase_, momPhase_;
 };
 MODULE_REGISTER(VPCounterTerms, TVPCounterTerms, MScalar);
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MScalar_VPCounterTerms_hpp_
--- a/Hadrons/Archive/Modules/WardIdentity.cc
+++ b/Hadrons/Archive/Modules/WardIdentity.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WardIdentity.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/WardIdentity.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of the TWardIdentity module template for FIMPL.
 template class Grid::Hadrons::MContraction::TWardIdentity<FIMPL>;
--- a/Hadrons/Archive/Modules/WardIdentity.hpp
+++ b/Hadrons/Archive/Modules/WardIdentity.hpp
@@ -1,224 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Archive/Modules/WardIdentity.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_WardIdentity_hpp_
 #define Hadrons_MContraction_WardIdentity_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /*
  Ward Identity contractions
 -----------------------------
 * options:
 - q:          propagator, 5D if available (string)
 - action:     action module used for propagator solution (string)
 - mass:       mass of quark (double)
 - test_axial: whether or not to test PCAC relation.
 */
 /******************************************************************************
 *                              WardIdentity                                  *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Serializable parameter set of the WardIdentity module:
 //   q          - name of the propagator (5D if available)
 //   action     - name of the action module used for the propagator solution
 //   mass       - quark mass entering the axial Ward identity
 //   test_axial - whether to also test the axial (PCAC) relation
 class WardIdentityPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WardIdentityPar,
                                    std::string, q,
                                    std::string, action,
                                    double,      mass,
                                    bool,        test_axial);
 };
 // Module logging vector (and optionally axial) Ward identity checks for a
 // propagator; it declares no output objects.
 template <typename FImpl>
 class TWardIdentity: public Module<WardIdentityPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TWardIdentity(const std::string name);
    // destructor
    virtual ~TWardIdentity(void) {};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 private:
    // Ls of the propagator, read from the environment in setup()
    unsigned int Ls_;
 };
 MODULE_REGISTER_TMP(WardIdentity, TWardIdentity<FIMPL>, MContraction);
 /******************************************************************************
 *                     TWardIdentity implementation                           *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<WardIdentityPar> base; Ls_ is set
 // later, in setup().
 template <typename FImpl>
 TWardIdentity<FImpl>::TWardIdentity(const std::string name)
 : Module<WardIdentityPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // Dependencies: the propagator and the action named in the parameters.
 template <typename FImpl>
 std::vector<std::string> TWardIdentity<FImpl>::getInput(void)
 {
    return {par().q, par().action};
 }
 // The module declares no output objects, so the returned list is empty.
 template <typename FImpl>
 std::vector<std::string> TWardIdentity<FImpl>::getOutput(void)
 {
    return std::vector<std::string>();
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Reads Ls from the propagator object, raises a Size error if the action
 // object reports a different Ls, and registers the temporaries used by
 // execute(). The axial-test temporaries are registered only when
 // par().test_axial is set.
 template <typename FImpl>
 void TWardIdentity<FImpl>::setup(void)
 {
    Ls_ = env().getObjectLs(par().q);
    if (Ls_ != env().getObjectLs(par().action))
    {
        HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
    }
    envTmpLat(PropagatorField, "tmp");
    envTmpLat(PropagatorField, "vector_WI");
    if (par().test_axial)
    {
        envTmpLat(PropagatorField, "psi");
        envTmpLat(LatticeComplex,  "PP");
        envTmpLat(LatticeComplex,  "axial_defect");
        envTmpLat(LatticeComplex,  "PJ5q");
    }
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Logs Ward identity checks for the propagator par().q.
 // The vector check sums the backward differences of the conserved vector
 // current over all directions and logs the norm of the result. When
 // par().test_axial is set, the axial check builds <P|D_mu A_mu>, <P|P> and
 // <P|J5q>, logs their norms, then logs the defect
 // <P|D_mu A_mu> - 2m<P|P> - 2<P|J5q> globally and per timeslice.
 template <typename FImpl>
 void TWardIdentity<FImpl>::execute(void)
 {
    LOG(Message) << "Performing Ward Identity checks for quark '" << par().q
                 << "'." << std::endl;
    auto  &q   = envGet(PropagatorField, par().q);
    auto  &act = envGet(FMat, par().action);
    Gamma g5(Gamma::Algebra::Gamma5);
    // Compute D_mu V_mu, D here is backward derivative.
    envGetTmp(PropagatorField, tmp);
    envGetTmp(PropagatorField, vector_WI);
    vector_WI    = zero;
    for (unsigned int mu = 0; mu < Nd; ++mu)
    {
        act.ContractConservedCurrent(q, q, tmp, Current::Vector, mu);
        tmp -= Cshift(tmp, mu, -1);
        vector_WI += tmp;
    }
    // Test ward identity D_mu V_mu = 0;
    LOG(Message) << "Vector Ward Identity check Delta_mu V_mu = " 
                 << norm2(vector_WI) << std::endl;
    if (par().test_axial)
    {
        envGetTmp(PropagatorField, psi);
        envGetTmp(LatticeComplex, PP);
        envGetTmp(LatticeComplex, axial_defect);
        envGetTmp(LatticeComplex, PJ5q);
        std::vector<TComplex> axial_buf;
        // Compute <P|D_mu A_mu>, D is backwards derivative.
        axial_defect = zero;
        for (unsigned int mu = 0; mu < Nd; ++mu)
        {
            act.ContractConservedCurrent(q, q, tmp, Current::Axial, mu);
            tmp -= Cshift(tmp, mu, -1);
            axial_defect += trace(g5*tmp);
        }
        // Get <P|J5q> for 5D (zero for 4D) and <P|P>.
        PJ5q = zero;
        if (Ls_ > 1)
        {
            // <P|P>
            ExtractSlice(tmp, q, 0, 0);
            psi  = 0.5 * (tmp - g5*tmp);
            ExtractSlice(tmp, q, Ls_ - 1, 0);
            psi += 0.5 * (tmp + g5*tmp);
            PP = trace(adj(psi)*psi);
            // <P|5Jq>
            ExtractSlice(tmp, q, Ls_/2 - 1, 0);
            psi  = 0.5 * (tmp + g5*tmp);
            ExtractSlice(tmp, q, Ls_/2, 0);
            psi += 0.5 * (tmp - g5*tmp);
            PJ5q = trace(adj(psi)*psi);
        }
        else
        {
            PP = trace(adj(q)*q);
        }
        // Test ward identity <P|D_mu A_mu> = 2m<P|P> + 2<P|J5q>
        LOG(Message) << "|D_mu A_mu|^2 = " << norm2(axial_defect) << std::endl;
        LOG(Message) << "|PP|^2        = " << norm2(PP) << std::endl;
        LOG(Message) << "|PJ5q|^2      = " << norm2(PJ5q) << std::endl;
        // Form the defect before reporting it; otherwise the message below
        // would just repeat |D_mu A_mu|^2.
        axial_defect -= 2.*(par().mass*PP + PJ5q);
        LOG(Message) << "Axial Ward Identity defect Delta_mu A_mu = "
                     << norm2(axial_defect) << std::endl;
        // Axial defect by timeslice.
        LOG(Message) << "Check Axial defect by timeslice" << std::endl;
        sliceSum(axial_defect, axial_buf, Tp);
        for (unsigned int t = 0; t < axial_buf.size(); ++t)
        {
            LOG(Message) << "t = " << t << ": " 
                         << TensorRemove(axial_buf[t]) << std::endl;
        }
    }
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_WardIdentity_hpp_
--- a/Hadrons/DilutedNoise.hpp
+++ b/Hadrons/DilutedNoise.hpp
@@ -1,250 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/DilutedNoise.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 Author: Vera Guelpers <vmg1n14@soton.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_DilutedNoise_hpp_
 #define Hadrons_DilutedNoise_hpp_
 #include <Hadrons/Global.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                   Abstract container for diluted noise                     *
 ******************************************************************************/
 // Abstract container holding a vector of FermionField noise vectors that all
 // live on one GridCartesian grid. Derived classes implement generateNoise()
 // to fill the vectors.
 template <typename FImpl>
 class DilutedNoise
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // constructor/destructor
    DilutedNoise(GridCartesian *g);
    DilutedNoise(GridCartesian *g, const unsigned int nNoise);
    virtual ~DilutedNoise(void) = default;
    // access
    std::vector<FermionField> &       getNoise(void);
    const std::vector<FermionField> & getNoise(void) const;
    const FermionField &              operator[](const unsigned int i) const;
    FermionField &                    operator[](const unsigned int i);
    void                              resize(const unsigned int nNoise);
    unsigned int                      size(void) const;
    GridCartesian                     *getGrid(void) const;
    // generate noise (pure virtual)
    virtual void generateNoise(GridParallelRNG &rng) = 0;
 private:
    // noise vectors, the grid they are built on, and the size last passed to
    // resize()
    std::vector<FermionField> noise_;
    GridCartesian             *grid_;
    unsigned int              nNoise_;
 };
 // Noise container with one vector per (time slice, spin, colour) combination;
 // the number of vectors is fixed by the constructor from the grid's last
 // global dimension.
 template <typename FImpl>
 class TimeDilutedSpinColorDiagonalNoise: public DilutedNoise<FImpl>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // constructor/destructor
    TimeDilutedSpinColorDiagonalNoise(GridCartesian *g);
    virtual ~TimeDilutedSpinColorDiagonalNoise(void) = default;
    // generate noise
    virtual void generateNoise(GridParallelRNG &rng);
 private:
    // extent of the last global dimension of the grid
    unsigned int nt_;
 };
 // Noise container with one vector per (source index, spin, colour)
 // combination, without any dilution in the lattice coordinates.
 template <typename FImpl>
 class FullVolumeSpinColorDiagonalNoise: public DilutedNoise<FImpl>
 {
 public:
    typedef typename FImpl::FermionField FermionField;
 public:
    // constructor/destructor
    FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src);
    virtual ~FullVolumeSpinColorDiagonalNoise(void) = default;
    // generate noise
    virtual void generateNoise(GridParallelRNG &rng);
 private:
    // number of source indices requested at construction
    unsigned int nSrc_;
 };
 /******************************************************************************
 *                    DilutedNoise template implementation                    *
 ******************************************************************************/
 // Builds an empty container on grid g. nNoise_ is set to 0 so that it never
 // holds an indeterminate value before the first resize().
 template <typename FImpl>
 DilutedNoise<FImpl>::DilutedNoise(GridCartesian *g)
 : grid_(g), nNoise_(0)
 {}
 // Builds a container on grid g holding nNoise vectors: delegates to the
 // single-argument constructor, then calls resize(nNoise).
 template <typename FImpl>
 DilutedNoise<FImpl>::DilutedNoise(GridCartesian *g,
                                   const unsigned int nNoise)
 : DilutedNoise(g)
 {
    resize(nNoise);
 }
 // Mutable access to the underlying vector of noise fields.
 template <typename FImpl>
 std::vector<typename DilutedNoise<FImpl>::FermionField> & DilutedNoise<FImpl>::
 getNoise(void)
 {
    return noise_;
 }
 // Read-only access to the underlying vector of noise fields.
 template <typename FImpl>
 const std::vector<typename DilutedNoise<FImpl>::FermionField> & DilutedNoise<FImpl>::
 getNoise(void) const
 {
    return noise_;
 }
 // Read-only access to noise vector i; the index is not range-checked.
 template <typename FImpl>
 const typename DilutedNoise<FImpl>::FermionField & 
 DilutedNoise<FImpl>::operator[](const unsigned int i) const
 {
    return noise_[i];
 }
 // Mutable access to noise vector i; the index is not range-checked.
 template <typename FImpl>
 typename DilutedNoise<FImpl>::FermionField & 
 DilutedNoise<FImpl>::operator[](const unsigned int i)
 {
    return noise_[i];
 }
 // Records the requested size and resizes the vector of noise fields to it,
 // passing grid_ as the value used to build any newly added elements.
 template <typename FImpl>
 void DilutedNoise<FImpl>::resize(const unsigned int nNoise)
 {
    nNoise_ = nNoise;
    noise_.resize(nNoise, grid_);
 }
 // Number of noise vectors currently stored (taken from the vector itself,
 // not from nNoise_).
 template <typename FImpl>
 unsigned int DilutedNoise<FImpl>::size(void) const
 {  
    return noise_.size();
 }
 // Grid pointer that was passed to the constructor.
 template <typename FImpl>
 GridCartesian * DilutedNoise<FImpl>::getGrid(void) const
 {
    return grid_;
 }
 /******************************************************************************
 *        TimeDilutedSpinColorDiagonalNoise template implementation           *
 ******************************************************************************/
 // Takes nt_ from the last entry of the grid's global dimensions and sizes the
 // container to nt_ * Ns * FImpl::Dimension vectors.
 template <typename FImpl>
 TimeDilutedSpinColorDiagonalNoise<FImpl>::
 TimeDilutedSpinColorDiagonalNoise(GridCartesian *g)
 : DilutedNoise<FImpl>(g)
 {
    nt_ = this->getGrid()->GlobalDimensions().back();
    this->resize(nt_*Ns*FImpl::Dimension);
 }
 // Fills every stored vector. One complex noise field is drawn from rng and
 // rescaled; for each time slice t it is masked to that slice, and for each
 // spin s and colour c the masked field is poked into an otherwise zero
 // vector. Vectors are filled in (t, s, c) order, with c running fastest.
 template <typename FImpl>
 void TimeDilutedSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
 {
    typedef decltype(peekColour((*this)[0], 0)) SpinField;
    auto                       &vectors   = *this;
    auto                       grid       = this->getGrid();
    auto                       nDim       = grid->GlobalDimensions().size();
    auto                       nColour    = FImpl::Dimension;
    Complex                    offset(1., 1.);
    Lattice<iScalar<vInteger>> timeCoor(grid);
    LatticeComplex             noiseField(grid), noiseSlice(grid);
    SpinField                  spinSrc(grid);
    unsigned int               idx = 0;
    // coordinate along the last dimension, used to select one slice at a time
    LatticeCoordinate(timeCoor, nDim - 1);
    // draw the noise once and rescale it
    bernoulli(rng, noiseField);
    noiseField = (2.*noiseField - offset)*(1./::sqrt(2.));
    for (unsigned int t = 0; t < nt_; ++t)
    {
        // keep the noise on slice t only
        noiseSlice = where((timeCoor == t), noiseField, 0.*noiseField);
        for (unsigned int s = 0; s < Ns; ++s)
        {
            spinSrc = zero;
            pokeSpin(spinSrc, noiseSlice, s);
            for (unsigned int c = 0; c < nColour; ++c)
            {
                auto &vec = vectors[idx++];
                vec = zero;
                pokeColour(vec, spinSrc, c);
            }
        }
    }
 }
 /******************************************************************************
 *        FullVolumeSpinColorDiagonalNoise template implementation           *
 ******************************************************************************/
 template <typename FImpl>
 FullVolumeSpinColorDiagonalNoise<FImpl>::
 FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc)
 : DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension), nSrc_(nSrc)
 {}
 // Fills every stored vector. For each of the nSrc_ source indices a fresh
 // complex noise field is drawn from rng and rescaled; for each spin s and
 // colour c that field is poked into an otherwise zero vector. Vectors are
 // filled in (n, s, c) order, with c running fastest.
 //
 // NOTE(review): the noise used to be drawn once, before the loop over n, so
 // all nSrc_ source indices received identical vectors. Drawing inside the
 // loop changes the RNG stream consumed when nSrc_ > 1 - confirm no caller
 // relies on the old behaviour.
 template <typename FImpl>
 void FullVolumeSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
 {
    typedef decltype(peekColour((*this)[0], 0)) SpinField;
    auto                       &noise = *this;
    auto                       g      = this->getGrid();
    auto                       nc     = FImpl::Dimension;
    Complex                    shift(1., 1.);
    LatticeComplex             eta(g);
    SpinField                  etas(g);
    unsigned int               i = 0;
    for (unsigned int n = 0; n < nSrc_; ++n)
    {
        // independent noise for every source index
        bernoulli(rng, eta);
        eta = (2.*eta - shift)*(1./::sqrt(2.));
        for (unsigned int s = 0; s < Ns; ++s)
        {
            etas = zero;
            pokeSpin(etas, eta, s);
            for (unsigned int c = 0; c < nc; ++c)
            {
                noise[i] = zero;
                pokeColour(noise[i], etas, c);
                i++;
            }
        }
    }
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_DilutedNoise_hpp_
--- a/Hadrons/DiskVector.hpp
+++ b/Hadrons/DiskVector.hpp
@@ -1,456 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/DiskVector.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_DiskVector_hpp_
 #define Hadrons_DiskVector_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #include <deque>
 #include <sys/stat.h>
 #include <ftw.h>
 #include <unistd.h>
 // Debug logging helper. When DV_DEBUG is defined, DV_DEBUG_MSG(dv, stream)
 // writes "diskvector <dv>: <stream>" to the Debug log channel; otherwise it
 // expands to nothing, so its arguments are not evaluated.
 #ifdef DV_DEBUG
 #define DV_DEBUG_MSG(dv, stream) LOG(Debug) << "diskvector " << (dv) << ": " << stream << std::endl
 #else
 #define DV_DEBUG_MSG(dv, stream)
 #endif
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                           Abstract base class                              *
 ******************************************************************************/
 // Abstract vector of `size` objects of type T living in files under a
 // directory, with an in-memory cache of `cacheSize` elements. Derived classes
 // provide the actual I/O through load() and save(). Reads go through the const
 // operator[]; writes go through the RwAccessHelper proxy returned by the
 // non-const operator[].
 template <typename T>
 class DiskVectorBase
 {
 public:
     typedef T ObjectType;
     // helper for read/write vector access
     class RwAccessHelper
     {
     public:
         RwAccessHelper(DiskVectorBase<T> &master, const unsigned int i)
         : master_(master), cmaster_(master), i_(i) {}
         // operator=: somebody is trying to store a vector element
         // write to cache and tag as modified
         T &operator=(const T &obj) const
         {
             auto &cache    = *master_.cachePtr_;
             auto &modified = *master_.modifiedPtr_;
             auto &index    = *master_.indexPtr_;
             DV_DEBUG_MSG(&master_, "writing to " << i_);
             master_.cacheInsert(i_, obj);
             modified[index.at(i_)] = true;
             return cache[index.at(i_)];
         }
         // implicit cast to const object reference and redirection
         // to the const operator[] for read-only operations
         operator const T&() const
         {
             return cmaster_[i_];
         }
     private:
         DiskVectorBase<T>       &master_;
         const DiskVectorBase<T> &cmaster_;
         const unsigned int      i_;
     };
 public:
     // dirname: directory holding the element files (must not exist yet)
     // size: number of elements, cacheSize: number of elements kept in memory
     // clean: if true the directory is removed on destruction
     DiskVectorBase(const std::string dirname, const unsigned int size = 0,
                    const unsigned int cacheSize = 1, const bool clean = true);
     DiskVectorBase(DiskVectorBase<T> &&v) = default;
     virtual ~DiskVectorBase(void);
     // read-only access, loads the element from disk on a cache miss
     const T & operator[](const unsigned int i) const;
     // read/write access through a proxy object
     RwAccessHelper operator[](const unsigned int i);
     // cache statistics for read-only accesses
     double hitRatio(void) const;
     void resetStat(void);
 private:
     // I/O primitives implemented by derived classes
     virtual void load(T &obj, const std::string filename) const = 0;
     virtual void save(const std::string filename, const T &obj) const = 0;
     // path of the file storing element i
     virtual std::string filename(const unsigned int i) const;
     // cache management
     void evict(void) const;
     void fetch(const unsigned int i) const;
     void cacheInsert(const unsigned int i, const T &obj) const;
     // remove the directory and its content
     void clean(void);
 private:
     std::string                                           dirname_;
     unsigned int                                          size_, cacheSize_;
     // statistics are updated from the const operator[]; declared mutable so
     // that updating them is well defined even on a const disk vector
     mutable double                                        access_{0.}, hit_{0.};
     bool                                                  clean_;
     // using pointers to allow modifications when class is const
     // semantic: const means data unmodified, but cache modification allowed
     std::unique_ptr<std::vector<T>>                       cachePtr_;    // cache slots
     std::unique_ptr<std::vector<bool>>                    modifiedPtr_; // per-slot dirty flag
     std::unique_ptr<std::map<unsigned int, unsigned int>> indexPtr_;    // element -> slot
     std::unique_ptr<std::stack<unsigned int>>             freePtr_;     // free slots
     std::unique_ptr<std::deque<unsigned int>>             loadsPtr_;    // cached elements, oldest use first
 };
 /******************************************************************************
 *                   Specialisation for serialisable classes                  *
 ******************************************************************************/
 // Disk vector for serialisable objects: each element is read with a Reader
 // and written with a Writer opened on the element file. The record name is
 // basename(filename).
 template <typename T, typename Reader, typename Writer>
 class SerializableDiskVector: public DiskVectorBase<T>
 {
 public:
     using DiskVectorBase<T>::DiskVectorBase;
 private:
     // `override` makes the compiler check these against the base-class
     // pure virtuals
     void load(T &obj, const std::string filename) const override
     {
         Reader reader(filename);
         read(reader, basename(filename), obj);
     }
     void save(const std::string filename, const T &obj) const override
     {
         Writer writer(filename);
         write(writer, basename(filename), obj);
     }
 };
 /******************************************************************************
 *                      Specialisation for Eigen matrices                     *
 ******************************************************************************/
 // Matrix type stored by EigenDiskVector: an alias for A2AMatrix<T>.
 template <typename T>
 using EigenDiskVectorMat = A2AMatrix<T>;
 // Disk vector of matrices stored in a raw binary format:
 // [uint32 checksum][Eigen::Index nRow][Eigen::Index nCol][nRow*nCol*sizeof(T) bytes]
 // The checksum covers the matrix data only and is verified on load.
 template <typename T>
 class EigenDiskVector: public DiskVectorBase<EigenDiskVectorMat<T>>
 {
 public:
     using DiskVectorBase<EigenDiskVectorMat<T>>::DiskVectorBase;
     typedef EigenDiskVectorMat<T> Matrix;
 public:
     // coefficient (j, k) of matrix i (read-only access through the cache)
     T operator()(const unsigned int i, const Eigen::Index j,
                  const Eigen::Index k) const
     {
         return (*this)[i](j, k);
     }
 private:
     // Read a matrix from filename into obj, raising an Io error if the file
     // cannot be opened, is truncated, or fails the checksum.
     void load(EigenDiskVectorMat<T> &obj, const std::string filename) const override
     {
         std::ifstream f(filename, std::ios::binary);
         uint32_t      crc, check;
         Eigen::Index  nRow, nCol;
         size_t        matSize;
         double        tRead, tHash;
         if (!f.is_open())
         {
             HADRONS_ERROR(Io, "cannot open '" + filename + "' for reading");
         }
         f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
         f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
         f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
         // the dimensions are only meaningful if the header was fully read
         if (!f)
         {
             HADRONS_ERROR(Io, "cannot read header of '" + filename + "'");
         }
         obj.resize(nRow, nCol);
         matSize = nRow*nCol*sizeof(T);
         tRead  = -usecond();
         f.read(reinterpret_cast<char *>(obj.data()), matSize);
         tRead += usecond();
         if (!f)
         {
             HADRONS_ERROR(Io, "cannot read matrix data from '" + filename + "'");
         }
         tHash  = -usecond();
 #ifdef USE_IPP
         check  = GridChecksum::crc32c(obj.data(), matSize);
 #else
         check  = GridChecksum::crc32(obj.data(), matSize);
 #endif
         tHash += usecond();
         DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
         DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
                      << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
         if (crc != check)
         {
             HADRONS_ERROR(Io, "checksum failed");
         }
     }
     // Write obj to filename, raising an Io error if the file cannot be
     // opened or the write fails.
     void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const override
     {
         std::ofstream f(filename, std::ios::binary);
         uint32_t      crc;
         Eigen::Index  nRow, nCol;
         size_t        matSize;
         double        tWrite, tHash;
         if (!f.is_open())
         {
             HADRONS_ERROR(Io, "cannot open '" + filename + "' for writing");
         }
         nRow    = obj.rows();
         nCol    = obj.cols();
         matSize = nRow*nCol*sizeof(T);
         tHash   = -usecond();
 #ifdef USE_IPP
         crc     = GridChecksum::crc32c(obj.data(), matSize);
 #else
         crc     = GridChecksum::crc32(obj.data(), matSize);
 #endif
         tHash  += usecond();
         f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
         f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
         f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
         tWrite = -usecond();
         f.write(reinterpret_cast<const char *>(obj.data()), matSize);
         tWrite += usecond();
         DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
         DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
                      << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
         // closing flushes the buffer, so that write errors are visible here
         f.close();
         if (f.fail())
         {
             HADRONS_ERROR(Io, "cannot write matrix to '" + filename + "'");
         }
     }
 };
 /******************************************************************************
 *                       DiskVectorBase implementation                         *
 ******************************************************************************/
 // Create the storage directory and an empty cache.
 // dirname   : directory holding the element files, must not already exist
 // size      : number of elements of the vector
 // cacheSize : number of elements kept in memory, must be at least 1
 // clean     : remove the directory when the vector is destroyed
 // Raises a Size error for a null cache size and an Io error if the directory
 // already exists.
 template <typename T>
 DiskVectorBase<T>::DiskVectorBase(const std::string dirname, 
                                   const unsigned int size,
                                   const unsigned int cacheSize,
                                   const bool clean)
 : dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean)
 // cache slots are numbered 0..cacheSize-1 (see the free slot stack below), so
 // the slot arrays are sized by cacheSize; sizing them by size would overflow
 // whenever cacheSize > size
 , cachePtr_(new std::vector<T>(cacheSize))
 , modifiedPtr_(new std::vector<bool>(cacheSize, false))
 , indexPtr_(new std::map<unsigned int, unsigned int>())
 , freePtr_(new std::stack<unsigned int>)
 , loadsPtr_(new std::deque<unsigned int>())
 {
     struct stat s;
     // with no slot at all, evict() would read the front of an empty queue
     if (cacheSize_ == 0)
     {
         HADRONS_ERROR(Size, "disk vector cache size must be at least 1");
     }
     if(stat(dirname.c_str(), &s) == 0)
     {
         HADRONS_ERROR(Io, "directory '" + dirname + "' already exists");
     }
     mkdir(dirname);
     // initially every slot is free
     for (unsigned int i = 0; i < cacheSize_; ++i)
     {
         freePtr_->push(i);
     }
 }
 // Destructor: removes the storage directory when cleaning was requested at
 // construction, otherwise leaves the files in place.
 template <typename T>
 DiskVectorBase<T>::~DiskVectorBase(void)
 {
     if (!clean_)
     {
         return;
     }
     clean();
 }
 // Read-only access to element i. On a cache miss the element is fetched from
 // disk (possibly evicting another one); on a hit it is moved to the back of
 // the load queue so that it becomes the last candidate for eviction. Updates
 // the access/hit statistics. Raises a Size error if i is out of range.
 template <typename T>
 const T & DiskVectorBase<T>::operator[](const unsigned int i) const
 {
     auto &cache = *cachePtr_;
     auto &index = *indexPtr_;
     auto &loads = *loadsPtr_;
     DV_DEBUG_MSG(this, "accessing " << i << " (RO)");
     if (i >= size_)
     {
         HADRONS_ERROR(Size, "index out of range");
     }
     const_cast<double &>(access_)++;
     if (index.find(i) == index.end())
     {
         // cache miss
         DV_DEBUG_MSG(this, "cache miss");
         fetch(i);
     }
     else
     {
         // cache hit: refresh the position of i in the load queue
         DV_DEBUG_MSG(this, "cache hit");
         const_cast<double &>(hit_)++;
         loads.erase(std::find(loads.begin(), loads.end(), i));
         loads.push_back(i);
     }
 #ifdef DV_DEBUG
     std::string msg;
     for (auto &p: loads)
     {
         msg += std::to_string(p) + " ";
     }
     DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif
     return cache[index.at(i)];
 }
 // Read/write access to element i: returns a proxy which writes to the cache
 // on assignment and falls back to the read-only operator[] on conversion to
 // const T &. Raises a Size error if i is out of range.
 template <typename T>
 typename DiskVectorBase<T>::RwAccessHelper DiskVectorBase<T>::operator[](const unsigned int i)
 {
     DV_DEBUG_MSG(this, "accessing " << i << " (RW)");
     const bool inRange = (i < size_);
     if (!inRange)
     {
         HADRONS_ERROR(Size, "index out of range");
     }
     return RwAccessHelper(*this, i);
 }
 // Ratio of cache hits to accesses, as counted by the read-only operator[].
 // Both counters start at 0, so the result is 0/0 until the first read-only
 // access after construction or resetStat().
 template <typename T>
 double DiskVectorBase<T>::hitRatio(void) const
 {
     const double ratio = hit_/access_;
     return ratio;
 }
 // Reset the access and hit counters used by hitRatio().
 template <typename T>
 void DiskVectorBase<T>::resetStat(void)
 {
     hit_    = 0.;
     access_ = 0.;
 }
 // Path of the file storing element i: "<dirname>/elem_<i>".
 template <typename T>
 std::string DiskVectorBase<T>::filename(const unsigned int i) const
 {
     std::string path = dirname_;
     path += "/elem_";
     path += std::to_string(i);
     return path;
 }
 // If the cache is full, drop the element at the front of the load queue:
 // it is saved to disk first when flagged as modified, then its slot is
 // returned to the free slot stack. Does nothing while the cache has room.
 template <typename T>
 void DiskVectorBase<T>::evict(void) const
 {
     auto &cache    = *cachePtr_;
     auto &modified = *modifiedPtr_;
     auto &index    = *indexPtr_;
     auto &freeInd  = *freePtr_;
     auto &loads    = *loadsPtr_;
     if (index.size() < cacheSize_)
     {
         return;
     }
     const unsigned int victim = loads.front();
     DV_DEBUG_MSG(this, "evicting " << victim);
     const unsigned int slot = index.at(victim);
     if (modified[slot])
     {
         DV_DEBUG_MSG(this, "element " << victim << " modified, saving to disk");
         save(filename(victim), cache[slot]);
     }
     freeInd.push(slot);
     index.erase(victim);
     loads.pop_front();
 }
 // Load element i from disk into a free cache slot (evicting another element
 // if the cache is full) and register it as unmodified. Raises an Io error if
 // the element file does not exist.
 template <typename T>
 void DiskVectorBase<T>::fetch(const unsigned int i) const
 {
     auto &cache    = *cachePtr_;
     auto &modified = *modifiedPtr_;
     auto &index    = *indexPtr_;
     auto &freeInd  = *freePtr_;
     auto &loads    = *loadsPtr_;
     struct stat s;
     const std::string file = filename(i);
     DV_DEBUG_MSG(this, "loading " << i << " from disk");
     // check the file before touching the cache, so that a failed fetch does
     // not evict anything
     if(stat(file.c_str(), &s) != 0)
     {
         HADRONS_ERROR(Io, "disk vector element " + std::to_string(i) + " uninitialised");
     }
     evict();
     // only register the element once load() succeeded: if load() raises, the
     // slot stays free and the index and load queue remain consistent
     const unsigned int slot = freeInd.top();
     load(cache[slot], file);
     freeInd.pop();
     index[i] = slot;
     loads.push_back(i);
     modified[slot] = false;
 }
 // Store obj as element i in the cache. On a miss a slot is obtained (evicting
 // another element if the cache is full); on a hit the cached value is
 // overwritten. In both cases the element is flagged as modified, so that it
 // is saved to disk on eviction, and becomes the most recently used one.
 template <typename T>
 void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
 {
     auto &cache    = *cachePtr_;
     auto &modified = *modifiedPtr_;
     auto &index    = *indexPtr_;
     auto &freeInd  = *freePtr_;
     auto &loads    = *loadsPtr_;
     auto it        = index.find(i);
     // cache miss, evict and store
     if (it == index.end())
     {
         evict();
         // copy first, register after: if the copy raises the slot stays free
         const unsigned int slot = freeInd.top();
         cache[slot] = obj;
         freeInd.pop();
         index[i] = slot;
         loads.push_back(i);
         // a freshly inserted element has no up-to-date copy on disk
         modified[slot] = true;
     }
     // cache hit, modify current value
     else
     {
         const unsigned int slot = it->second;
         cache[slot]    = obj;
         modified[slot] = true;
         loads.erase(std::find(loads.begin(), loads.end(), i));
         loads.push_back(i);
     }
 #ifdef DV_DEBUG
     std::string msg;
     for (auto &p: loads)
     {
         msg += std::to_string(p) + " ";
     }
     DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif
 }
 // DV_DEBUG_MSG is defined whether or not DV_DEBUG is set (as an empty macro
 // in the latter case), so it is always undefined here to keep it from leaking
 // into files including this header.
 #undef DV_DEBUG_MSG
 // Remove the storage directory and everything below it, using a depth-first
 // nftw() walk which does not follow symbolic links.
 // NOTE(review): the callback raises a Hadrons error from inside nftw(), and
 // clean() is called from the destructor -- confirm that this failure path is
 // acceptable. The return value of nftw() is not checked.
 template <typename T>
 void DiskVectorBase<T>::clean(void)
 {
     auto removeEntry = [](const char *path, const struct stat *, 
                           int, struct FTW *)
     {
         const int status = remove(path);
         if (status != 0)
         {
             HADRONS_ERROR(Io, "cannot remove '" + std::string(path) + "': "
                           + std::string(std::strerror(errno)));
         }
         return status;
     };
     nftw(dirname_.c_str(), removeEntry, 64, FTW_DEPTH | FTW_PHYS);
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_DiskVector_hpp_
--- a/Hadrons/EigenPack.hpp
+++ b/Hadrons/EigenPack.hpp
@@ -1,416 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/EigenPack.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_EigenPack_hpp_
 #define Hadrons_EigenPack_hpp_
 #include <Hadrons/Global.hpp>
 #include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
 BEGIN_HADRONS_NAMESPACE
 // Lanczos type
 // Default value (60) for the Lanczos basis size; it can be overridden by
 // defining HADRONS_DEFAULT_LANCZOS_NBASIS before this header is included.
 // presumably used as the nBasis argument of coarse eigenpacks -- TODO confirm
 #ifndef HADRONS_DEFAULT_LANCZOS_NBASIS
 #define HADRONS_DEFAULT_LANCZOS_NBASIS 60
 #endif
 // Log the operator and solver XML metadata of an eigenpack record.
 // Wrapped in do { } while (0) so that the macro expands to a single statement
 // and stays correct inside an unbraced if/else; use it with a trailing ';'.
 #define HADRONS_DUMP_EP_METADATA(record) \
 do\
 {\
     LOG(Message) << "Eigenpack metadata:" << std::endl;\
     LOG(Message) << "* operator" << std::endl;\
     LOG(Message) << (record).operatorXml << std::endl;\
     LOG(Message) << "* solver" << std::endl;\
     LOG(Message) << (record).solverXml << std::endl;\
 } while (0)
 // Eigenpack metadata: XML descriptions of the operator and of the solver,
 // as read from / written to the eigenpack file header.
 struct PackRecord
 {
     std::string operatorXml;
     std::string solverXml;
 };
 // Serialisable per-vector record stored alongside each eigenvector:
 // its position in the pack (index) and the associated eigenvalue (eval).
 struct VecRecord: Serializable
 {
     GRID_SERIALIZABLE_CLASS_MEMBERS(VecRecord,
                                     unsigned int, index,
                                     double,       eval);
     // default record: index 0, eigenvalue 0
     VecRecord(void): index(0), eval(0.) {}
 };
 namespace EigenPackIo
 {
    inline void readHeader(PackRecord &record, ScidacReader &binReader)
    {
        std::string recordXml;
        binReader.readLimeObject(recordXml, SCIDAC_FILE_XML);
        XmlReader xmlReader(recordXml, true, "eigenPackPar");
        xmlReader.push();
        xmlReader.readCurrentSubtree(record.operatorXml);
        xmlReader.nextElement();
        xmlReader.readCurrentSubtree(record.solverXml);
    }
    template <typename T, typename TIo = T>
    void readElement(T &evec, RealD &eval, const unsigned int index,
                     ScidacReader &binReader, TIo *ioBuf = nullptr)
    {
        VecRecord vecRecord;
        LOG(Message) << "Reading eigenvector " << index << std::endl;
        if (ioBuf == nullptr)
        {
            binReader.readScidacFieldRecord(evec, vecRecord);
        }
        else
        {
            binReader.readScidacFieldRecord(*ioBuf, vecRecord);
            precisionChange(evec, *ioBuf);
        }
        if (vecRecord.index != index)
        {
            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
                            + " wrong index (expected " + std::to_string(vecRecord.index) 
                            + ")");
        }
        eval = vecRecord.eval;
    }
    template <typename T, typename TIo = T>
    static void readPack(std::vector<T> &evec, std::vector<RealD> &eval,
                         PackRecord &record, const std::string filename, 
                         const unsigned int size, bool multiFile, 
                         GridBase *gridIo = nullptr)
    {
        std::unique_ptr<TIo> ioBuf{nullptr};
        ScidacReader         binReader;
        if (typeHash<T>() != typeHash<TIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
            ioBuf.reset(new TIo(gridIo));
        }
        if (multiFile)
        {
            std::string fullFilename;
            for(int k = 0; k < size; ++k) 
            {
                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
                binReader.open(fullFilename);
                readHeader(record, binReader);
                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
                binReader.close();
            }
        }
        else
        {
            binReader.open(filename);
            readHeader(record, binReader);
            for(int k = 0; k < size; ++k) 
            {
                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
            }
            binReader.close();
        }
    }
    inline void writeHeader(ScidacWriter &binWriter, PackRecord &record)
    {
        XmlWriter xmlWriter("", "eigenPackPar");
        xmlWriter.pushXmlString(record.operatorXml);
        xmlWriter.pushXmlString(record.solverXml);
        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
    }
    template <typename T, typename TIo = T>
    void writeElement(ScidacWriter &binWriter, T &evec, RealD &eval, 
                      const unsigned int index, TIo *ioBuf, 
                      T *testBuf = nullptr)
    {
        VecRecord vecRecord;
        LOG(Message) << "Writing eigenvector " << index << std::endl;
        vecRecord.eval  = eval;
        vecRecord.index = index;
        if ((ioBuf == nullptr) || (testBuf == nullptr))
        {
            binWriter.writeScidacFieldRecord(evec, vecRecord, DEFAULT_ASCII_PREC);
        }
        else
        {
            precisionChange(*ioBuf, evec);
            precisionChange(*testBuf, *ioBuf);
            *testBuf -= evec;
            LOG(Message) << "Precision diff norm^2 " << norm2(*testBuf) << std::endl;
            binWriter.writeScidacFieldRecord(*ioBuf, vecRecord, DEFAULT_ASCII_PREC);
        }   
    }
    template <typename T, typename TIo = T>
    static void writePack(const std::string filename, std::vector<T> &evec, 
                          std::vector<RealD> &eval, PackRecord &record, 
                          const unsigned int size, bool multiFile, 
                          GridBase *gridIo = nullptr)
    {
        GridBase             *grid = evec[0]._grid;
        std::unique_ptr<TIo> ioBuf{nullptr}; 
        std::unique_ptr<T>   testBuf{nullptr};
        ScidacWriter         binWriter(grid->IsBoss());
        if (typeHash<T>() != typeHash<TIo>())
        {
            if (gridIo == nullptr)
            {
                HADRONS_ERROR(Definition, 
                              "I/O type different from vector type but null I/O grid passed");
            }
            ioBuf.reset(new TIo(gridIo));
            testBuf.reset(new T(grid));
        }
        if (multiFile)
        {
            std::string fullFilename;
            for(int k = 0; k < size; ++k) 
            {
                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
                makeFileDir(fullFilename, grid);
                binWriter.open(fullFilename);
                writeHeader(binWriter, record);
                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
                binWriter.close();
            }
        }
        else
        {
            makeFileDir(filename, grid);
            binWriter.open(filename);
            writeHeader(binWriter, record);
            for(int k = 0; k < size; ++k) 
            {
                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
            }
            binWriter.close();
        }
    }
 }
 // Minimal eigenpack: eigenvalues, eigenvectors of field type F and the
 // associated metadata record, without any I/O.
 template <typename F>
 class BaseEigenPack
 {
 public:
     using Field = F;
 public:
     std::vector<RealD> eval;
     std::vector<F>     evec;
     PackRecord         record;
 public:
     // resize both containers to `size` elements, new vectors being built
     // from grid
     void resize(const size_t size, GridBase *grid)
     {
         eval.resize(size);
         evec.resize(size, grid);
     }
     BaseEigenPack(void) = default;
     // pack of `size` eigenpairs on grid
     BaseEigenPack(const size_t size, GridBase *grid)
     {
         resize(size, grid);
     }
     virtual ~BaseEigenPack(void) = default;
 };
 // Eigenpack with file I/O. F is the in-memory field type and FIo the field
 // type used on disk; when they differ an I/O grid must be provided.
 template <typename F, typename FIo = F>
 class EigenPack: public BaseEigenPack<F>
 {
 public:
     typedef F   Field;
     typedef FIo FieldIo;
 public:
     EigenPack(void)          = default;
     virtual ~EigenPack(void) = default;
     // pack of `size` eigenpairs on grid; gridIo is the grid of the I/O fields
     // and must be non-null when F and FIo differ (Definition error otherwise)
     EigenPack(const size_t size, GridBase *grid, GridBase *gridIo = nullptr)
     : BaseEigenPack<F>(size, grid)
     {
         if (typeHash<F>() != typeHash<FIo>())
         {
             if (gridIo == nullptr)
             {
                 HADRONS_ERROR(Definition, 
                               "I/O type different from vector type but null I/O grid passed");
             }
         }
         gridIo_ = gridIo;
     }
     // read evec.size() eigenpairs from the file(s) built from fileStem and
     // traj (see evecFilename), then log the metadata
     virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
     {
         EigenPackIo::readPack<F, FIo>(this->evec, this->eval, this->record, 
                                       evecFilename(fileStem, traj, multiFile), 
                                       this->evec.size(), multiFile, gridIo_);
         HADRONS_DUMP_EP_METADATA(this->record);
     }
     // write evec.size() eigenpairs to the file(s) built from fileStem and traj
     virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
     {
         EigenPackIo::writePack<F, FIo>(evecFilename(fileStem, traj, multiFile), 
                                        this->evec, this->eval, this->record, 
                                        this->evec.size(), multiFile, gridIo_);
     }
 protected:
     // "<stem>[.<traj>]" in multi-file mode (a directory),
     // "<stem>[.<traj>].bin" otherwise; the trajectory is omitted if negative
     std::string evecFilename(const std::string stem, const int traj, const bool multiFile)
     {
         std::string t = (traj < 0) ? "" : ("." + std::to_string(traj));
         if (multiFile)
         {
             return stem + t;
         }
         else
         {
             return stem + t + ".bin";
         }
     }
 protected:
     // initialised to null so that a default-constructed pack never passes an
     // indeterminate pointer to the I/O routines
     GridBase *gridIo_{nullptr};
 };
 // Eigenpack holding both fine eigenvectors (inherited evec/eval) and coarse
 // eigenvectors (evecCoarse/evalCoarse), each with its own in-memory and I/O
 // field types. Fine and coarse data are stored in files with "_fine" and
 // "_coarse" appended to the file stem.
 template <typename FineF, typename CoarseF, 
           typename FineFIo = FineF, typename CoarseFIo = CoarseF>
 class CoarseEigenPack: public EigenPack<FineF, FineFIo>
 {
 public:
     typedef CoarseF   CoarseField;
     typedef CoarseFIo CoarseFieldIo;
 public:      
     std::vector<CoarseF> evecCoarse;
     std::vector<RealD>   evalCoarse;
 public:
     CoarseEigenPack(void)          = default;
     virtual ~CoarseEigenPack(void) = default;
     // sizeFine/sizeCoarse eigenpairs on gridFine/gridCoarse; an I/O grid must
     // be non-null whenever the corresponding I/O field type differs from the
     // in-memory one (Definition error otherwise)
     CoarseEigenPack(const size_t sizeFine, const size_t sizeCoarse, 
                     GridBase *gridFine, GridBase *gridCoarse,
                     GridBase *gridFineIo = nullptr, 
                     GridBase *gridCoarseIo = nullptr)
     {
         if (typeHash<FineF>() != typeHash<FineFIo>())
         {
             if (gridFineIo == nullptr)
             {
                 HADRONS_ERROR(Definition, 
                               "Fine I/O type different from vector type but null fine I/O grid passed");
             }
         }
         if (typeHash<CoarseF>() != typeHash<CoarseFIo>())
         {
             if (gridCoarseIo == nullptr)
             {
                 HADRONS_ERROR(Definition, 
                               "Coarse I/O type different from vector type but null coarse I/O grid passed");
             }
         }
         this->gridIo_ = gridFineIo;
         gridCoarseIo_ = gridCoarseIo;
         resize(sizeFine, sizeCoarse, gridFine, gridCoarse);
     }
     // resize the fine and coarse containers
     void resize(const size_t sizeFine, const size_t sizeCoarse, 
                 GridBase *gridFine, GridBase *gridCoarse)
     {
         EigenPack<FineF, FineFIo>::resize(sizeFine, gridFine);
         evalCoarse.resize(sizeCoarse);
         evecCoarse.resize(sizeCoarse, gridCoarse);
     }
     void readFine(const std::string fileStem, const bool multiFile, const int traj = -1)
     {
         EigenPack<FineF, FineFIo>::read(fileStem + "_fine", multiFile, traj);
     }
     // the header of the coarse file is read into a dummy record and discarded
     void readCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
     {
         PackRecord dummy;
         EigenPackIo::readPack<CoarseF, CoarseFIo>(evecCoarse, evalCoarse, dummy, 
                               this->evecFilename(fileStem + "_coarse", traj, multiFile), 
                               evecCoarse.size(), multiFile, gridCoarseIo_);
     }
     void read(const std::string fileStem, const bool multiFile, const int traj = -1) override
     {
         readFine(fileStem, multiFile, traj);
         readCoarse(fileStem, multiFile, traj);
     }
     void writeFine(const std::string fileStem, const bool multiFile, const int traj = -1)
     {
         EigenPack<FineF, FineFIo>::write(fileStem + "_fine", multiFile, traj);
     }
     // the coarse file carries the same metadata record as the fine one
     void writeCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
     {
         EigenPackIo::writePack<CoarseF, CoarseFIo>(this->evecFilename(fileStem + "_coarse", traj, multiFile), 
                                                    evecCoarse, evalCoarse, this->record, 
                                                    evecCoarse.size(), multiFile, gridCoarseIo_);
     }
     void write(const std::string fileStem, const bool multiFile, const int traj = -1) override
     {
         writeFine(fileStem, multiFile, traj);
         writeCoarse(fileStem, multiFile, traj);
     }
 private:
     // initialised to null so that a default-constructed pack never passes an
     // indeterminate pointer to the I/O routines
     GridBase *gridCoarseIo_{nullptr};
 };
 // Convenience aliases instantiating the eigenpack classes from fermion
 // implementation types.
 // BaseFermionEigenPack: BaseEigenPack of FImpl fermion fields.
 template <typename FImpl>
 using BaseFermionEigenPack = BaseEigenPack<typename FImpl::FermionField>;
 // FermionEigenPack: EigenPack of FImpl fermion fields, using FImplIo fermion
 // fields for I/O.
 template <typename FImpl, typename FImplIo = FImpl>
 using FermionEigenPack = EigenPack<typename FImpl::FermionField, typename FImplIo::FermionField>;
 // CoarseFermionEigenPack: CoarseEigenPack whose coarse field types are the
 // LocalCoherenceLanczos CoarseField for nBasis basis vectors, for FImpl
 // (in memory) and FImplIo (I/O) respectively.
 template <typename FImpl, int nBasis, typename FImplIo = FImpl>
 using CoarseFermionEigenPack = CoarseEigenPack<
     typename FImpl::FermionField,
     typename LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
                                    typename FImpl::SiteComplex, 
                                    nBasis>::CoarseField,
     typename FImplIo::FermionField,
     typename LocalCoherenceLanczos<typename FImplIo::SiteSpinor, 
                                    typename FImplIo::SiteComplex, 
                                    nBasis>::CoarseField>;
 #undef HADRONS_DUMP_EP_METADATA
 END_HADRONS_NAMESPACE
 #endif // Hadrons_EigenPack_hpp_
--- a/Hadrons/Environment.cc
+++ b/Hadrons/Environment.cc
@@ -1,337 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Environment.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Environment.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 using namespace Grid;
 using namespace QCD;
 using namespace Hadrons;
 // Raise an ObjectDefinition error (through HADRONS_ERROR_REF) reporting that
 // 'address' does not designate any object. The expansion already ends with a
 // ';', so a use site written as ERROR_NO_ADDRESS(x); yields an extra empty
 // statement.
 #define ERROR_NO_ADDRESS(address)\
 HADRONS_ERROR_REF(ObjectDefinition, "no object with address " + std::to_string(address), address);
 /******************************************************************************
 *                       Environment implementation                           *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 Environment::Environment(void)
 {
    // lattice extents come from Grid's defaults; nd_ is simply their count
    dim_ = GridDefaultLatt();
    nd_  = dim_.size();
    // volume is the product of all extents, accumulated as a double
    vol_ = 1.;
    for (unsigned int mu = 0; mu < dim_.size(); ++mu)
    {
        vol_ *= dim_[mu];
    }
 }
 // grids ///////////////////////////////////////////////////////////////////////
 // Number of lattice dimensions (size of the extent vector set in the constructor).
 unsigned int Environment::getNd(void) const
 {
    return nd_;
 }
 // Copy of the full vector of lattice extents.
 std::vector<int> Environment::getDim(void) const
 {
    return dim_;
 }
 // Extent in direction mu. No range check is performed on mu.
 int Environment::getDim(const unsigned int mu) const
 {
    return dim_[mu];
 }
 // Lattice volume, i.e. the product of all extents computed in the constructor.
 double Environment::getVolume(void) const
 {
    return vol_;
 }
 // random number generator /////////////////////////////////////////////////////
 // Return the shared 4D parallel RNG, building it on the default grid the
 // first time it is requested. The Environment keeps ownership.
 GridParallelRNG * Environment::get4dRng(void)
 {
    if (!rng4d_)
    {
        rng4d_.reset(new GridParallelRNG(getGrid()));
    }
    return rng4d_.get();
 }
 // general memory management ///////////////////////////////////////////////////
 void Environment::addObject(const std::string name, const int moduleAddress)
 {
    if (!hasObject(name))
    {
        ObjInfo info;
        info.name   = name;
        info.module = moduleAddress;
        info.data   = nullptr;
        object_.push_back(std::move(info));
        objectAddress_[name] = static_cast<unsigned int>(object_.size() - 1);
    }
    else
    {
        HADRONS_ERROR_REF(ObjectDefinition, "object '" + name + "' already exists",
                          getObjectAddress(name));
    }
 }
 void Environment::setObjectModule(const unsigned int objAddress,
                                  const int modAddress)
 {
    object_[objAddress].module = modAddress;
 }
 // Number of registered objects; valid addresses are strictly below this value
 // (see hasObject(address)).
 unsigned int Environment::getMaxAddress(void) const
 {
    return object_.size();
 }
 // Translate an object name into its address. Unknown names raise a
 // Definition error (HADRONS_ERROR is expected not to return).
 unsigned int Environment::getObjectAddress(const std::string name) const
 {
    if (!hasObject(name))
    {
        HADRONS_ERROR(Definition, "no object with name '" + name + "'");
    }
    return objectAddress_.at(name);
 }
 // Name of the object stored at 'address'; invalid addresses raise the
 // ERROR_NO_ADDRESS error (expected not to return).
 std::string Environment::getObjectName(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    return object_[address].name;
 }
 // Printable type of the object at 'address': the result of typeName() on the
 // recorded type_info, or "<no type>" when no type has been recorded.
 // Invalid addresses raise the ERROR_NO_ADDRESS error (expected not to return).
 std::string Environment::getObjectType(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    const auto recorded = object_[address].type;
    if (!recorded)
    {
        return "<no type>";
    }
    return typeName(recorded);
 }
 // Same query, addressed by object name.
 std::string Environment::getObjectType(const std::string name) const
 {
    const unsigned int address = getObjectAddress(name);
    return getObjectType(address);
 }
 // Recorded size of the object at 'address'; invalid addresses raise the
 // ERROR_NO_ADDRESS error (expected not to return).
 Environment::Size Environment::getObjectSize(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    return object_[address].size;
 }
 // Same query, addressed by object name.
 Environment::Size Environment::getObjectSize(const std::string name) const
 {
    const unsigned int address = getObjectAddress(name);
    return getObjectSize(address);
 }
 // Storage class (object, cache or temporary) recorded for the object at
 // 'address'; invalid addresses raise the ERROR_NO_ADDRESS error (expected not
 // to return).
 Environment::Storage Environment::getObjectStorage(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    return object_[address].storage;
 }
 // Same query, addressed by object name.
 Environment::Storage Environment::getObjectStorage(const std::string name) const
 {
    const unsigned int address = getObjectAddress(name);
    return getObjectStorage(address);
 }
 // Address of the module recorded for the object at 'address' (the ObjInfo
 // default is -1); invalid addresses raise the ERROR_NO_ADDRESS error
 // (expected not to return).
 int Environment::getObjectModule(const unsigned int address) const
 {
    if (!hasObject(address))
    {
        ERROR_NO_ADDRESS(address);
    }
    return object_[address].module;
 }
 // Same query, addressed by object name.
 int Environment::getObjectModule(const std::string name) const
 {
    const unsigned int address = getObjectAddress(name);
    return getObjectModule(address);
 }
 unsigned int Environment::getObjectLs(const unsigned int address) const
 {
    if (hasCreatedObject(address))
    {
        return object_[address].Ls;
    }
    else
    {
        ERROR_NO_ADDRESS(address);
    }
 }
 unsigned int Environment::getObjectLs(const std::string name) const
 {
    return getObjectLs(getObjectAddress(name));
 }
 // An address is valid when it indexes an entry of the object store.
 bool Environment::hasObject(const unsigned int address) const
 {
    return (address < object_.size());
 }
 // A name is known when it maps to a valid address.
 bool Environment::hasObject(const std::string name) const
 {
    const auto entry = objectAddress_.find(name);
    if (entry == objectAddress_.end())
    {
        return false;
    }
    return hasObject(entry->second);
 }
 // True when 'address' is registered and its entry currently holds data.
 bool Environment::hasCreatedObject(const unsigned int address) const
 {
    return hasObject(address) and (object_[address].data != nullptr);
 }
 // Same query by name; unknown names yield false rather than an error.
 bool Environment::hasCreatedObject(const std::string name) const
 {
    return hasObject(name) and hasCreatedObject(getObjectAddress(name));
 }
 // An object counts as five-dimensional when its recorded Ls is larger than 1.
 // Errors from getObjectLs() propagate unchanged.
 bool Environment::isObject5d(const unsigned int address) const
 {
    return (getObjectLs(address) > 1);
 }
 // Same query, addressed by object name.
 bool Environment::isObject5d(const std::string name) const
 {
    return (getObjectLs(name) > 1);
 }
 // Sum of the recorded sizes of every registered object.
 Environment::Size Environment::getTotalSize(void) const
 {
    Environment::Size total = 0;
    for (unsigned int address = 0; address < object_.size(); ++address)
    {
        total += object_[address].size;
    }
    return total;
 }
 void Environment::freeObject(const unsigned int address)
 {
    if (hasCreatedObject(address))
    {
        LOG(Message) << "Destroying object '" << object_[address].name
                     << "'" << std::endl;
    }
    object_[address].size = 0;
    object_[address].type = nullptr;
    object_[address].data.reset(nullptr);
 }
 void Environment::freeObject(const std::string name)
 {
    freeObject(getObjectAddress(name));
 }
 // Release the data of every registered object, in address order.
 void Environment::freeAll(void)
 {
    unsigned int address = 0;
    while (address < object_.size())
    {
        freeObject(address);
        ++address;
    }
 }
 // Set the protection flag. createDerivedObject() consults it: when objects
 // are not protected, an already allocated object is overwritten.
 void Environment::protectObjects(const bool protect)
 {
    protect_ = protect;
 }
 // Current value of the protection flag.
 bool Environment::objectsProtected(void) const
 {
    return protect_;
 }
 // print environment content ///////////////////////////////////////////////////
 // Dump "address: name (size)" for every registered object to the Debug log.
 void Environment::printContent(void) const
 {
    const unsigned int nObject = object_.size();
    LOG(Debug) << "Objects: " << std::endl;
    for (unsigned int address = 0; address < nObject; ++address)
    {
        LOG(Debug) << std::setw(4) << address << ": "
                   << getObjectName(address) << " ("
                   << sizeString(getObjectSize(address)) << ")" << std::endl;
    }
 }
--- a/Hadrons/Environment.hpp
+++ b/Hadrons/Environment.hpp
@@ -1,585 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Environment.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Environment_hpp_
 #define Hadrons_Environment_hpp_
 #include <Hadrons/Global.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         Global environment                                 *
 ******************************************************************************/
 // Empty polymorphic base of everything stored in the environment: the virtual
 // destructor lets entries be owned and destroyed through an Object pointer.
 class Object
 {
 public:
    Object() = default;
    virtual ~Object() = default;
 };
 // Object wrapper owning a single heap-allocated T through a unique_ptr.
 template <typename T>
 class Holder: public Object
 {
 public:
    // empty holder (no object attached)
    Holder(void) = default;
    // take ownership of pt
    Holder(T *pt);
    virtual ~Holder(void) = default;
    // reference to the held object (the pointer is dereferenced unchecked)
    T &       get(void) const;
    // raw, non-owning pointer to the held object
    T *       getPt(void) const;
    // replace the held object by pt (forwards to unique_ptr::reset)
    void      reset(T *pt);
 private:
    std::unique_ptr<T> objPt_{nullptr};
 };
 // Declares an inline const member function env() returning the Environment
 // singleton; meant to be expanded inside a class body.
 #define DEFINE_ENV_ALIAS \
 inline Environment & env(void) const\
 {\
    return Environment::getInstance();\
 }
 // Declares a local lambda named env returning the Environment singleton.
 // The expansion has no trailing ';'.
 #define DEFINE_ENV_LAMBDA \
 auto env = [](void)->Environment &{return Environment::getInstance();}
 // Singleton owning the lattice geometry, the grids (cached per SIMD type and
 // Ls / block size), the 4D random number generator and the store of named
 // objects addressed by integer index.
 class Environment
 {
    SINGLETON(Environment);
 public:
    typedef SITE_SIZE_TYPE                         Size;
    typedef std::unique_ptr<GridCartesian>         GridPt;
    typedef std::unique_ptr<GridRedBlackCartesian> GridRbPt;
    typedef std::unique_ptr<GridParallelRNG>       RngPt;
    // storage class recorded for each object
    enum class Storage {object, cache, temporary};
 private:
    // bookkeeping record of one entry of the object store
    struct ObjInfo
    {
        Size                    size{0};
        Storage                 storage{Storage::object};
        unsigned int            Ls{0};
        const std::type_info    *type{nullptr}, *derivedType{nullptr};
        std::string             name;
        int                     module{-1};
        std::unique_ptr<Object> data{nullptr};
    };
    // fine grids are keyed by (hash of the SIMD type, Ls)
    typedef std::pair<size_t, unsigned int>     FineGridKey;
    // coarse grids are keyed by (hash of the SIMD type, block size vector)
    typedef std::pair<size_t, std::vector<int>> CoarseGridKey;
 public:
    // grids
    template <typename VType = vComplex>
    void                    createGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    void                    createCoarseGrid(const std::vector<int> &blockSize,
                                             const unsigned int Ls);
    template <typename VType = vComplex>
    GridCartesian *         getGrid(void);
    template <typename VType = vComplex>
    GridRedBlackCartesian * getRbGrid(void);
    template <typename VType = vComplex>
    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize);
    template <typename VType = vComplex>
    GridCartesian *         getGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    GridRedBlackCartesian * getRbGrid(const unsigned int Ls);
    template <typename VType = vComplex>
    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize,
                                          const unsigned int Ls);
    std::vector<int>        getDim(void) const;
    int                     getDim(const unsigned int mu) const;
    unsigned int            getNd(void) const;
    double                  getVolume(void) const;
    // random number generator
    GridParallelRNG *       get4dRng(void);
    // general memory management
    void                    addObject(const std::string name,
                                      const int moduleAddress = -1);
    // B is the type the object is stored and looked up as, T the concrete type
    // actually constructed from args
    template <typename B, typename T, typename ... Ts>
    void                    createDerivedObject(const std::string name,
                                                const Environment::Storage storage,
                                                const unsigned int Ls,
                                                Ts && ... args);
    template <typename T, typename ... Ts>
    void                    createObject(const std::string name,
                                         const Environment::Storage storage,
                                         const unsigned int Ls,
                                         Ts && ... args);
    void                    setObjectModule(const unsigned int objAddress,
                                            const int modAddress);
    template <typename B, typename T>
    T *                     getDerivedObject(const unsigned int address) const;
    template <typename B, typename T>
    T *                     getDerivedObject(const std::string name) const;
    template <typename T>
    T *                     getObject(const unsigned int address) const;
    template <typename T>
    T *                     getObject(const std::string name) const;
    unsigned int            getMaxAddress(void) const;
    unsigned int            getObjectAddress(const std::string name) const;
    std::string             getObjectName(const unsigned int address) const;
    std::string             getObjectType(const unsigned int address) const;
    std::string             getObjectType(const std::string name) const;
    Size                    getObjectSize(const unsigned int address) const;
    Size                    getObjectSize(const std::string name) const;
    Storage                 getObjectStorage(const unsigned int address) const;
    Storage                 getObjectStorage(const std::string name) const;
    int                     getObjectModule(const unsigned int address) const;
    int                     getObjectModule(const std::string name) const;
    unsigned int            getObjectLs(const unsigned int address) const;
    unsigned int            getObjectLs(const std::string name) const;
    bool                    hasObject(const unsigned int address) const;
    bool                    hasObject(const std::string name) const;
    bool                    hasCreatedObject(const unsigned int address) const;
    bool                    hasCreatedObject(const std::string name) const;
    bool                    isObject5d(const unsigned int address) const;
    bool                    isObject5d(const std::string name) const;
    template <typename T>
    bool                    isObjectOfType(const unsigned int address) const;
    template <typename T>
    bool                    isObjectOfType(const std::string name) const;
    Environment::Size       getTotalSize(void) const;
    void                    freeObject(const unsigned int address);
    void                    freeObject(const std::string name);
    void                    freeAll(void);
    void                    protectObjects(const bool protect);
    bool                    objectsProtected(void) const;
    // print environment content
    void                    printContent(void) const;
 private:
    // general
    double                              vol_;
    // when true, createDerivedObject() refuses to overwrite allocated objects
    bool                                protect_{true};
    // grids
    std::vector<int>                    dim_;
    std::map<FineGridKey, GridPt>       grid4d_;
    std::map<FineGridKey, GridPt>       grid5d_;
    std::map<FineGridKey, GridRbPt>     gridRb4d_;
    std::map<FineGridKey, GridRbPt>     gridRb5d_;
    std::map<CoarseGridKey, GridPt>     gridCoarse4d_;
    std::map<CoarseGridKey, GridPt>     gridCoarse5d_;
    unsigned int                        nd_;
    // random number generator
    RngPt                               rng4d_{nullptr};
    // object store
    // entries indexed by address, plus the name -> address lookup table
    std::vector<ObjInfo>                object_;
    std::map<std::string, unsigned int> objectAddress_;
 };
 /******************************************************************************
 *                       Holder template implementation                       *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename T>
 Holder<T>::Holder(T *pt)
 : objPt_(pt)
 {}
 // access //////////////////////////////////////////////////////////////////////
 template <typename T>
 T & Holder<T>::get(void) const
 {
    return *objPt_.get();
 }
 template <typename T>
 T * Holder<T>::getPt(void) const
 {
    return objPt_.get();
 }
 template <typename T>
 void Holder<T>::reset(T *pt)
 {
    objPt_.reset(pt);
 }
 /******************************************************************************
 *                     Environment template implementation                    *
 ******************************************************************************/
 // grids ///////////////////////////////////////////////////////////////////////
 // Log, at Debug level, the pointer value of a freshly created grid followed by
 // its checkerboarding flag and its fdim/gdim/ldim/rdim dimension members.
 // The argument is expanded once per line, so it should be free of side effects.
 #define HADRONS_DUMP_GRID(...)\
 LOG(Debug) << "New grid " << (__VA_ARGS__) << std::endl;\
 LOG(Debug) << " - cb  : " << (__VA_ARGS__)->_isCheckerBoarded << std::endl;\
 LOG(Debug) << " - fdim: " << (__VA_ARGS__)->_fdimensions << std::endl;\
 LOG(Debug) << " - gdim: " << (__VA_ARGS__)->_gdimensions << std::endl;\
 LOG(Debug) << " - ldim: " << (__VA_ARGS__)->_ldimensions << std::endl;\
 LOG(Debug) << " - rdim: " << (__VA_ARGS__)->_rdimensions << std::endl;
 template <typename VType>
 void Environment::createGrid(const unsigned int Ls)
 {
    size_t hash = typeHash<VType>();
    if (grid4d_.find({hash, 1}) == grid4d_.end())
    {
        grid4d_[{hash, 1}].reset(
            SpaceTimeGrid::makeFourDimGrid(getDim(), 
                                        GridDefaultSimd(getNd(), VType::Nsimd()),
                                        GridDefaultMpi()));
        HADRONS_DUMP_GRID(grid4d_[{hash, 1}].get());
        gridRb4d_[{hash, 1}].reset(
            SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_[{hash, 1}].get()));
        HADRONS_DUMP_GRID(gridRb4d_[{hash, 1}].get());
    }
    if (grid5d_.find({hash, Ls}) == grid5d_.end())
    {
        auto g = grid4d_[{hash, 1}].get();
        grid5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimGrid(Ls, g));
        HADRONS_DUMP_GRID(grid5d_[{hash, Ls}].get());
        gridRb5d_[{hash, Ls}].reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, g));
        HADRONS_DUMP_GRID(gridRb5d_[{hash, Ls}].get());
    }
 }
 template <typename VType>
 void Environment::createCoarseGrid(const std::vector<int> &blockSize,
                                   const unsigned int Ls)
 {
    int              nd      = getNd();
    std::vector<int> fineDim = getDim(), coarseDim(nd);
    unsigned int     cLs;
    auto             key4d = blockSize, key5d = blockSize;
    size_t           hash  = typeHash<VType>();
    createGrid(Ls);
    for (int d = 0; d < coarseDim.size(); d++)
    {
        coarseDim[d] = fineDim[d]/blockSize[d];
        if (coarseDim[d]*blockSize[d] != fineDim[d])
        {
            HADRONS_ERROR(Size, "Fine dimension " + std::to_string(d) 
                         + " (" + std::to_string(fineDim[d]) 
                         + ") not divisible by coarse dimension ("
                         + std::to_string(coarseDim[d]) + ")"); 
        }
    }
    if (blockSize.size() > nd)
    {
        cLs = Ls/blockSize[nd];
        if (cLs*blockSize[nd] != Ls)
        {
            HADRONS_ERROR(Size, "Fine Ls (" + std::to_string(Ls) 
                         + ") not divisible by coarse Ls ("
                         + std::to_string(cLs) + ")");
        }
    }
    else
    {
        cLs = Ls;
    }
    key4d.resize(nd);
    key5d.push_back(Ls);
    CoarseGridKey hkey4d = {hash, key4d}, hkey5d = {hash, key5d};
    if (gridCoarse4d_.find(hkey4d) == gridCoarse4d_.end())
    {
        gridCoarse4d_[hkey4d].reset(
            SpaceTimeGrid::makeFourDimGrid(coarseDim, 
                GridDefaultSimd(nd, VType::Nsimd()), GridDefaultMpi()));
        HADRONS_DUMP_GRID(gridCoarse4d_[hkey4d].get());
    }
    if (gridCoarse5d_.find(hkey5d) == gridCoarse5d_.end())
    {
        gridCoarse5d_[hkey5d].reset(
            SpaceTimeGrid::makeFiveDimGrid(cLs, gridCoarse4d_[hkey4d].get()));
        HADRONS_DUMP_GRID(gridCoarse5d_[hkey5d].get());
    }
 }
 #undef HADRONS_DUMP_GRID
 // 4D grid for SIMD type VType, created on first request.
 template <typename VType>
 GridCartesian * Environment::getGrid(void)
 {
    const FineGridKey key = {typeHash<VType>(), 1};
    if (grid4d_.count(key) == 0)
    {
        createGrid<VType>(1);
    }
    return grid4d_.at(key).get();
 }
 // 4D red-black grid for SIMD type VType, created on first request.
 template <typename VType>
 GridRedBlackCartesian * Environment::getRbGrid(void)
 {
    const FineGridKey key = {typeHash<VType>(), 1};
    if (gridRb4d_.count(key) == 0)
    {
        createGrid<VType>(1);
    }
    return gridRb4d_.at(key).get();
 }
 // 4D coarse grid for SIMD type VType and blocking 'blockSize', created on
 // first request. Only the first getNd() block sizes enter the cache key.
 template <typename VType>
 GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize)
 {
    std::vector<int> block4d(blockSize);
    block4d.resize(getNd());
    const CoarseGridKey key = {typeHash<VType>(), block4d};
    if (gridCoarse4d_.count(key) == 0)
    {
        createCoarseGrid<VType>(blockSize, 1);
    }
    return gridCoarse4d_.at(key).get();
 }
 // 5D grid of extent Ls for SIMD type VType, created on first request.
 template <typename VType>
 GridCartesian * Environment::getGrid(const unsigned int Ls)
 {
    const FineGridKey key = {typeHash<VType>(), Ls};
    if (grid5d_.count(key) == 0)
    {
        createGrid<VType>(Ls);
    }
    return grid5d_.at(key).get();
 }
 // 5D red-black grid of extent Ls for SIMD type VType, created on first request.
 template <typename VType>
 GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls)
 {
    const FineGridKey key = {typeHash<VType>(), Ls};
    if (gridRb5d_.count(key) == 0)
    {
        createGrid<VType>(Ls);
    }
    return gridRb5d_.at(key).get();
 }
 // 5D coarse grid for SIMD type VType, blocking 'blockSize' and fine extent
 // Ls, created on first request. The cache key is the block vector followed
 // by Ls.
 template <typename VType>
 GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize,
                                            const unsigned int Ls)
 {
    std::vector<int> block5d(blockSize);
    block5d.push_back(Ls);
    const CoarseGridKey key = {typeHash<VType>(), block5d};
    if (gridCoarse5d_.count(key) == 0)
    {
        createCoarseGrid<VType>(blockSize, Ls);
    }
    return gridCoarse5d_.at(key).get();
 }
 // general memory management ///////////////////////////////////////////////////
 // Create (or overwrite) the object 'name' as a Holder<B> owning a new T built
 // from 'args'. The name is registered first if it is not known yet.
 // The object is (re)built when its entry holds no data, or when objects are
 // not protected. Otherwise an ObjectDefinition error is raised unless the
 // existing entry is a cache with the same storage, name, base and derived type.
 template <typename B, typename T, typename ... Ts>
 void Environment::createDerivedObject(const std::string name,
                                       const Environment::Storage storage,
                                       const unsigned int Ls,
                                       Ts && ... args)
 {
    if (!hasObject(name))
    {
        addObject(name);
    }
    unsigned int address = getObjectAddress(name);
    if (!object_[address].data or !objectsProtected())
    {
        // if no memory statistics are being collected, collect them locally for
        // the duration of the construction
        MemoryStats memStats;
        if (!MemoryProfiler::stats)
        {
            MemoryProfiler::stats = &memStats;
        }
        // snapshot taken before construction; the recorded size is the maximum
        // allocation reported afterwards minus this value
        size_t initMem               = MemoryProfiler::stats->currentlyAllocated;
        object_[address].storage     = storage;
        object_[address].Ls          = Ls;
        object_[address].data.reset(new Holder<B>(new T(std::forward<Ts>(args)...)));
        object_[address].size        = MemoryProfiler::stats->maxAllocated - initMem;
        object_[address].type        = typeIdPt<B>();
        object_[address].derivedType = typeIdPt<T>();
        // detach the local statistics again if they were installed above
        if (MemoryProfiler::stats == &memStats)
        {
            MemoryProfiler::stats = nullptr;
        }
    }
    // object already exists, no error if it is a cache, error otherwise
    else if ((object_[address].storage               != Storage::cache) or 
             (object_[address].storage               != storage)        or
             (object_[address].name                  != name)           or
             (typeHash(object_[address].type)        != typeHash<B>())  or
             (typeHash(object_[address].derivedType) != typeHash<T>()))
    {
        HADRONS_ERROR_REF(ObjectDefinition, "object '" + name + "' already allocated", address);
    }
 }
 // Create an object stored under its own type: forwards to
 // createDerivedObject with base type and concrete type both equal to T.
 template <typename T, typename ... Ts>
 void Environment::createObject(const std::string name, 
                                const Environment::Storage storage,
                                const unsigned int Ls,
                                Ts && ... args)
 {
    createDerivedObject<T, T>(name, storage, Ls, std::forward<Ts>(args)...);
 }
 template <typename B, typename T>
 T * Environment::getDerivedObject(const unsigned int address) const
 {
    if (hasObject(address))
    {
        if (hasCreatedObject(address))
        {
            if (auto h = dynamic_cast<Holder<B> *>(object_[address].data.get()))
            {
                if (&typeid(T) == &typeid(B))
                {
                    return dynamic_cast<T *>(h->getPt());
                }
                else
                {
                    if (auto hder = dynamic_cast<T *>(h->getPt()))
                    {
                        return hder;
                    }
                    else
                    {
                        HADRONS_ERROR_REF(ObjectType, "object with address " +
                            std::to_string(address) +
                            " cannot be casted to '" + typeName(&typeid(T)) +
                            "' (has type '" + typeName(&typeid(h->get())) + "')", address);
                    }
                }
            }
            else
            {
                HADRONS_ERROR_REF(ObjectType, "object with address " + 
                            std::to_string(address) +
                            " does not have type '" + typeName(&typeid(B)) +
                            "' (has type '" + getObjectType(address) + "')", address);
            }
        }
        else
        {
            HADRONS_ERROR_REF(ObjectDefinition, "object with address " + 
                              std::to_string(address) + " is empty", address);
        }
    }
    else
    {
        HADRONS_ERROR_REF(ObjectDefinition, "no object with address " + 
                          std::to_string(address), address);
    }
 }
 // Name-based variant: resolves the address, then defers to the address overload.
 template <typename B, typename T>
 T * Environment::getDerivedObject(const std::string name) const
 {
    return getDerivedObject<B, T>(getObjectAddress(name));
 }
 // Fetch an object stored under its own type (base and requested type are both T).
 template <typename T>
 T * Environment::getObject(const unsigned int address) const
 {
    return getDerivedObject<T, T>(address);
 }
 // Name-based variant of getObject.
 template <typename T>
 T * Environment::getObject(const std::string name) const
 {
    return getObject<T>(getObjectAddress(name));
 }
 template <typename T>
 bool Environment::isObjectOfType(const unsigned int address) const
 {
    if (hasObject(address))
    {
        if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
        {
            return true;
        }
        else
        {
            return false;
        }
    }
    else
    {
        HADRONS_ERROR_REF(ObjectDefinition, "no object with address " 
                          + std::to_string(address), address);
    }
 }
 template <typename T>
 bool Environment::isObjectOfType(const std::string name) const
 {
    return isObjectOfType<T>(getObjectAddress(name));
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Environment_hpp_
--- a/Hadrons/Exceptions.cc
+++ b/Hadrons/Exceptions.cc
@@ -1,102 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Exceptions.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Exceptions.hpp>
 #include <Hadrons/VirtualMachine.hpp>
 #include <Hadrons/Module.hpp>
 // Suffix appended to error messages: the source location 'loc' in
 // parentheses. Can be overridden by defining ERR_SUFF beforehand.
 #ifndef ERR_SUFF
 #define ERR_SUFF " (" + loc + ")"
 #endif
 // Define the (msg, loc) constructor of exception class 'name' with an empty
 // body; 'init' is the member-initializer list (typically the base-class call).
 #define CTOR_EXC(name, init) \
 name::name(std::string msg, std::string loc)\
 :init\
 {}
 // Same for exception classes that also carry an object address.
 #define CTOR_EXC_REF(name, init) \
 name::name(std::string msg, std::string loc, const unsigned int address)\
 :init\
 {}
 using namespace Grid;
 using namespace Hadrons;
 using namespace Exceptions;
 // backtrace cache
 std::vector<std::string> Grid::Hadrons::Exceptions::backtraceStr;
 // logic errors
 // Logic forwards "msg (loc)" to logic_error; the classes below prefix the
 // message with their category and delegate to Logic.
 CTOR_EXC(Logic, logic_error(msg + ERR_SUFF))
 CTOR_EXC(Definition, Logic("definition error: " + msg, loc))
 CTOR_EXC(Implementation, Logic("implementation error: " + msg, loc))
 CTOR_EXC(Range, Logic("range error: " + msg, loc))
 CTOR_EXC(Size, Logic("size error: " + msg, loc))
 // runtime errors
 // Runtime forwards "msg (loc)" to runtime_error; the classes below prefix the
 // message with their category and delegate to Runtime.
 CTOR_EXC(Runtime, runtime_error(msg + ERR_SUFF))
 CTOR_EXC(Argument, Runtime("argument error: " + msg, loc))
 CTOR_EXC(Io, Runtime("IO error: " + msg, loc))
 CTOR_EXC(Memory, Runtime("memory error: " + msg, loc))
 CTOR_EXC(Parsing, Runtime("parsing error: " + msg, loc))
 CTOR_EXC(Program, Runtime("program error: " + msg, loc))
 CTOR_EXC(System, Runtime("system error: " + msg, loc))
 // virtual machine errors
 // these carry the address of the offending object and delegate to RuntimeRef
 CTOR_EXC_REF(ObjectDefinition, RuntimeRef("object definition error: " + msg, loc, address));
 CTOR_EXC_REF(ObjectType, RuntimeRef("object type error: " + msg, loc, address));
 // abort functions
 // Report a fatal exception on the Error log and terminate the program:
 // prints the exception type, the module being executed (when the virtual
 // machine reports a non-negative current module), the exception message and
 // the cached backtrace if any, then finalises Grid and exits with
 // EXIT_FAILURE. Does not return.
 void Grid::Hadrons::Exceptions::abort(const std::exception& e)
 {
    auto      &vm         = VirtualMachine::getInstance();
    const int modAddress  = vm.getCurrentModule();
    const bool inModule   = (modAddress >= 0);
    const bool hasTrace   = !backtraceStr.empty();
    LOG(Error) << "FATAL ERROR -- Exception " << typeName(&typeid(e)) 
               << std::endl;
    if (inModule)
    {
        LOG(Error) << "During execution of module '"
                    << vm.getModuleName(modAddress) << "' (address " << modAddress << ")"
                    << std::endl;
    }
    LOG(Error) << e.what() << std::endl;
    if (hasTrace)
    {
        LOG(Error) << "-- BACKTRACE --------------" << std::endl;
        for (auto &frame: backtraceStr)
        {
            LOG(Error) << frame << std::endl;
        }
        LOG(Error) << "---------------------------" << std::endl;
    }
    LOG(Error) << "Aborting program" << std::endl;
    Grid_finalize();
    exit(EXIT_FAILURE);
 }
--- a/Hadrons/Exceptions.hpp
+++ b/Hadrons/Exceptions.hpp
@@ -1,129 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Exceptions.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Exceptions_hpp_
 #define Hadrons_Exceptions_hpp_
 #include <stdexcept>
 #include <execinfo.h>
 #ifndef Hadrons_Global_hpp_
 #include <Hadrons/Global.hpp>
 #endif
 // Source location of the expansion site as a std::string:
 // "<function> at <file>:<line>". Parenthesised so that it behaves as a single
 // expression wherever it is expanded.
 #define HADRONS_SRC_LOC (std::string(__FUNCTION__) + " at " \
                         + std::string(__FILE__) + ":" + std::to_string(__LINE__))
 // Maximum number of stack frames stored in the backtrace cache.
 #define HADRONS_BACKTRACE_MAX 128
 // HADRONS_CACHE_BACKTRACE replaces the content of
 // Grid::Hadrons::Exceptions::backtraceStr with the current call stack (one
 // string per frame). It always expands to ONE compound statement, so it is
 // safe after an unbraced `if`/`else`/`for`.
 #ifdef HAVE_EXECINFO_H
 // backtrace_symbols() returns NULL when it cannot allocate its result; in that
 // case the cache is simply left empty instead of dereferencing NULL.
 #define HADRONS_CACHE_BACKTRACE \
 {\
    void* _callstack[HADRONS_BACKTRACE_MAX];\
    int _i, _frames = backtrace(_callstack, HADRONS_BACKTRACE_MAX);\
    char** _strs = backtrace_symbols(_callstack, _frames);\
    Grid::Hadrons::Exceptions::backtraceStr.clear();\
    if (_strs != nullptr)\
    {\
        for (_i = 0; _i < _frames; ++_i)\
        {\
            Grid::Hadrons::Exceptions::backtraceStr.push_back(std::string(_strs[_i]));\
        }\
        free(_strs);\
    }\
 }
 #else
 #define HADRONS_CACHE_BACKTRACE \
 {\
    Grid::Hadrons::Exceptions::backtraceStr.clear();\
    Grid::Hadrons::Exceptions::backtraceStr.push_back("<backtrace not supported>");\
 }
 #endif
 // Cache the backtrace, then throw Exceptions::exc(msg, <source location>).
 // The whole expansion is one brace-enclosed block: previously it was several
 // statements, so `if (cond) HADRONS_ERROR(...)` threw unconditionally. A plain
 // block (rather than do { } while (0)) keeps call sites compiling whether or
 // not they add a trailing ';'.
 #define HADRONS_ERROR(exc, msg)\
 {\
    HADRONS_CACHE_BACKTRACE \
    throw(Exceptions::exc(msg, HADRONS_SRC_LOC));\
 }
 // Same as HADRONS_ERROR for exceptions that also take an object address.
 #define HADRONS_ERROR_REF(exc, msg, address)\
 {\
    HADRONS_CACHE_BACKTRACE \
    throw(Exceptions::exc(msg, HADRONS_SRC_LOC, address));\
 }
 // Declare exception class `name` publicly derived from `base`, with a
 // (message, location) constructor. Only the declaration is produced here; the
 // constructor is defined out of line. No trailing ';' is emitted, the
 // invocation must supply it.
 #define DECL_EXC(name, base) \
 class name: public base\
 {\
 public:\
    name(std::string msg, std::string loc);\
 }
 // Same as DECL_EXC for exceptions whose constructor additionally takes an
 // unsigned `address` argument.
 #define DECL_EXC_REF(name, base) \
 class name: public base\
 {\
 public:\
    name(std::string msg, std::string loc, const unsigned int address);\
 }
 BEGIN_HADRONS_NAMESPACE
 // Exception hierarchy used by the HADRONS_ERROR macros, plus the global
 // backtrace cache and the fatal-error handler.
 namespace Exceptions
 {
    // backtrace cache
    // (written by HADRONS_CACHE_BACKTRACE right before an exception is thrown)
    extern std::vector<std::string> backtraceStr;
    // logic errors
    // Logic derives from std::logic_error; the others derive from Logic.
    DECL_EXC(Logic, std::logic_error);
    DECL_EXC(Definition, Logic);
    DECL_EXC(Implementation, Logic);
    DECL_EXC(Range, Logic);
    DECL_EXC(Size, Logic);
    // runtime errors
    // Runtime derives from std::runtime_error; the others derive from Runtime.
    DECL_EXC(Runtime, std::runtime_error);
    DECL_EXC(Argument, Runtime);
    DECL_EXC(Io, Runtime);
    DECL_EXC(Memory, Runtime);
    DECL_EXC(Parsing, Runtime);
    DECL_EXC(Program, Runtime);
    DECL_EXC(System, Runtime);
    // virtual machine errors
    // Runtime error that additionally stores an unsigned address supplied by
    // the thrower, retrievable with getAddress().
    class RuntimeRef: public Runtime
    {
    public:
        // msg and loc are forwarded to Runtime; address is stored as is.
        RuntimeRef(std::string msg, std::string loc, const unsigned int address)
        : Runtime(msg, loc), address_(address)
        {}
        // Address given at construction.
        unsigned int getAddress(void) const
        {
            return address_;
        }
    private:
        unsigned int address_;
    };
    DECL_EXC_REF(ObjectDefinition, RuntimeRef);
    DECL_EXC_REF(ObjectType, RuntimeRef);
    // abort functions
    // Log the exception and terminate the program (defined in Exceptions.cc).
    void abort(const std::exception& e);
 }
 END_HADRONS_NAMESPACE
 #endif // Hadrons_Exceptions_hpp_
--- a/Hadrons/Global.cc
+++ b/Hadrons/Global.cc
@@ -1,214 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Global.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Global.hpp>
 using namespace Grid;
 using namespace QCD;
 using namespace Hadrons;
 // Definitions of the logging channels declared `extern` in Global.hpp and
 // used through the LOG(channel) macro. The first constructor argument is the
 // `on` flag of HadronsLogger, the second the channel name.
 HadronsLogger Hadrons::HadronsLogError(1,"Error");
 HadronsLogger Hadrons::HadronsLogWarning(1,"Warning");
 HadronsLogger Hadrons::HadronsLogMessage(1,"Message");
 HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative");
 HadronsLogger Hadrons::HadronsLogDebug(1,"Debug");
 HadronsLogger Hadrons::HadronsLogIRL(1,"IRL");
 // Configure the Grid and Hadrons logging channels:
 //  - Grid channels get a top width equal to the length of "Hadrons" and a
 //    channel width of 8 (presumably so both families of log lines align --
 //    TODO confirm against Logger);
 //  - Hadrons Error/Warning are always active, the other Hadrons channels
 //    mirror the activation state of the corresponding Grid channel;
 //  - Hadrons channels get the same channel width as the Grid ones.
 void Hadrons::initLogger(void)
 {
    auto w  = std::string("Hadrons").length();
    int  cw = 8;
    // width settings for the Grid channels
    GridLogError.setTopWidth(w);
    GridLogWarning.setTopWidth(w);
    GridLogMessage.setTopWidth(w);
    GridLogIterative.setTopWidth(w);
    GridLogDebug.setTopWidth(w);
    GridLogIRL.setTopWidth(w);
    GridLogError.setChanWidth(cw);
    GridLogWarning.setChanWidth(cw);
    GridLogMessage.setChanWidth(cw);
    GridLogIterative.setChanWidth(cw);
    GridLogDebug.setChanWidth(cw);
    GridLogIRL.setChanWidth(cw);
    // activation of the Hadrons channels
    HadronsLogError.Active(true);
    HadronsLogWarning.Active(true);
    HadronsLogMessage.Active(GridLogMessage.isActive());
    HadronsLogIterative.Active(GridLogIterative.isActive());
    HadronsLogDebug.Active(GridLogDebug.isActive());
    HadronsLogIRL.Active(GridLogIRL.isActive());
    // width settings for the Hadrons channels
    HadronsLogError.setChanWidth(cw);
    HadronsLogWarning.setChanWidth(cw);
    HadronsLogMessage.setChanWidth(cw);
    HadronsLogIterative.setChanWidth(cw);
    HadronsLogDebug.setChanWidth(cw);
    HadronsLogIRL.setChanWidth(cw);
 }
 // type utilities //////////////////////////////////////////////////////////////
 // Hash code of the type described by *info (std::type_info::hash_code).
 size_t Hadrons::typeHash(const std::type_info *info)
 {
    const std::type_info &type = *info;

    return type.hash_code();
 }
 constexpr unsigned int maxNameSize = 1024u;
 // Human-readable name of the type described by *info.
 // The mangled name is passed through abi::__cxa_demangle; if demangling fails
 // (NULL result or non-zero status) the raw std::type_info::name() string is
 // returned instead of constructing a std::string from a null pointer.
 std::string Hadrons::typeName(const std::type_info *info)
 {
    int         status = 0;
    char        *buf   = abi::__cxa_demangle(info->name(), nullptr, nullptr,
                                             &status);
    std::string name;

    if ((status == 0) and (buf != nullptr))
    {
        name = buf;
    }
    else
    {
        name = info->name();
    }
    // buffer is malloc'ed by __cxa_demangle; free(nullptr) is a no-op
    free(buf);

    return name;
 }
 // default writers/readers /////////////////////////////////////////////////////
 // Extension used by RESULT_FILE_NAME: "h5" when built with HDF5 support,
 // "xml" otherwise (matches the ResultReader/ResultWriter typedefs in
 // Global.hpp, which switch on the same macro).
 #ifdef HAVE_HDF5
 const std::string Hadrons::resultFileExt = "h5";
 #else
 const std::string Hadrons::resultFileExt = "xml";
 #endif
 // recursive mkdir /////////////////////////////////////////////////////////////
 // Create directory `dirName` and all its missing parents with mode 755.
 // Returns 0 without touching the file system when `dirName` is empty or is
 // already accessible for read/write/execute; otherwise returns the result of
 // the final ::mkdir call (0 on success, -1 with errno set on failure).
 // The path is handled in a std::string: the previous fixed-size buffer
 // silently truncated paths of MAX_PATH_LENGTH characters or more and created
 // the wrong directory.
 int Hadrons::mkdir(const std::string dirName)
 {
    if (dirName.empty() or (access(dirName.c_str(), R_OK|W_OK|X_OK) == 0))
    {
        return 0;
    }

    const mode_t mode755 = S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
    std::string  path    = dirName;

    // ignore a single trailing separator
    if (path.back() == '/')
    {
        path.pop_back();
    }
    // Create every intermediate component. Position 0 is skipped so a leading
    // '/' is not treated as a component. Errors are deliberately ignored here
    // (the component usually exists already); only the last call is reported.
    for (std::string::size_type i = 1; i < path.size(); ++i)
    {
        if (path[i] == '/')
        {
            ::mkdir(path.substr(0, i).c_str(), mode755);
        }
    }

    return ::mkdir(path.c_str(), mode755);
 }
 // Part of `s` after the last '/'; `s` itself when it contains no '/'.
 // A string ending in '/' yields an empty result.
 std::string Hadrons::basename(const std::string &s)
 {
    const std::string::size_type lastSep = s.rfind('/');

    return (lastSep == std::string::npos) ? s : s.substr(lastSep + 1);
 }
 // Part of `s` before the last '/' (separator excluded); empty string when
 // `s` contains no '/'.
 std::string Hadrons::dirname(const std::string &s)
 {
    const std::string::size_type lastSep = s.rfind('/');

    if (lastSep == std::string::npos)
    {
        return "";
    }

    return s.substr(0, lastSep);
 }
 void Hadrons::makeFileDir(const std::string filename, GridBase *g)
 {
    bool doIt = true;
    if (g)
    {
        doIt = g->IsBoss();
    }
    if (doIt)
    {
        std::string dir    = dirname(filename);
        int         status = mkdir(dir);
        if (status)
        {
            HADRONS_ERROR(Io, "cannot create directory '" + dir
                          + "' ( " + std::strerror(errno) + ")");
        }
    }
 }
 // Print the entries of `timing` on the Message channel, largest duration
 // first, one line per entry: right-aligned name, raw count followed by " us",
 // and the share of `total` as a percentage with one decimal.
 // Entries are collected in a multimap: with a plain map keyed by the count,
 // two entries with the same duration overwrote each other and one of them
 // disappeared from the report. Entries with equal durations are listed in
 // the (name-sorted) order of `timing`.
 // The formatting flags and precision of std::cout are restored on exit.
 void Hadrons::printTimeProfile(const std::map<std::string, GridTime> &timing, 
                               GridTime total)
 {
    typedef decltype(total.count()) Count;

    std::multimap<Count, std::string, std::greater<Count>> rtiming;
    const double dtotal = static_cast<double>(total.count());
    auto cf = std::cout.flags();
    auto p  = std::cout.precision();
    unsigned int width = 0;

    for (auto &t: timing)
    {
        // column width = longest entry name
        width = std::max(width, static_cast<unsigned int>(t.first.length()));
        rtiming.emplace(t.second.count(), t.first);
    }
    for (auto &rt: rtiming)
    {
        LOG(Message) << std::setw(width) << rt.second << ": " 
                     << rt.first << " us (" << std::fixed 
                     << std::setprecision(1) 
                     << static_cast<double>(rt.first)/dtotal*100 << "%)"
                     << std::endl;
    }
    std::cout.flags(cf);
    std::cout.precision(p);
 }
--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -1,283 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Global.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_Global_hpp_
 #define Hadrons_Global_hpp_
 #include <set>
 #include <stack>
 #include <regex>
 #include <Grid/Grid.h>
 #include <cxxabi.h>
 #ifndef SITE_SIZE_TYPE
 #define SITE_SIZE_TYPE size_t
 #endif
 #ifndef DEFAULT_ASCII_PREC
 #define DEFAULT_ASCII_PREC 16
 #endif
 // Pass an argument that itself contains commas (e.g. a template-id with
 // several parameters) as one macro argument.
 #define ARG(...) __VA_ARGS__
 /* the 'using Grid::operator<<;' statement prevents a very nasty compilation
 * error with GCC 5 (clang & GCC 6 compile fine without it).
 */
 // Open namespace Grid::Hadrons; Grid::QCD is made visible inside Grid and the
 // Grid stream operators are re-exported inside Hadrons.
 #define BEGIN_HADRONS_NAMESPACE \
 namespace Grid {\
 using namespace QCD;\
 namespace Hadrons {\
 using Grid::operator<<;\
 using Grid::operator>>;
 // Close both namespaces opened by BEGIN_HADRONS_NAMESPACE.
 #define END_HADRONS_NAMESPACE }}
 // Open a module namespace `name` (to be used inside the Hadrons namespace)
 // with the same stream-operator re-export.
 #define BEGIN_MODULE_NAMESPACE(name)\
 namespace name {\
 using Grid::operator<<;\
 using Grid::operator>>;
 // Close the namespace opened by BEGIN_MODULE_NAMESPACE.
 #define END_MODULE_NAMESPACE }
 // Token-paste `impl` and `sub`. The two-level definition makes sure macro
 // arguments (e.g. FIMPLBASE) are expanded before being pasted.
 #define _HADRONS_IMPL(impl, sub) impl##sub
 #define HADRONS_IMPL(impl, sub)   _HADRONS_IMPL(impl, sub)
 // Each *IMPLBASE macro below can be overridden before this header is included
 // (or on the compiler command line); the derived macros append the suffix R,
 // F or D to the base name.
 // NOTE(review): R/F/D presumably select default/single/double precision
 // implementations -- confirm against the Grid type names.
 // fermion implementation
 #ifndef FIMPLBASE
 #define FIMPLBASE WilsonImpl
 #endif
 #define FIMPL  HADRONS_IMPL(FIMPLBASE, R)
 #define FIMPLF HADRONS_IMPL(FIMPLBASE, F)
 #define FIMPLD HADRONS_IMPL(FIMPLBASE, D)
 // "Z" fermion implementation
 #ifndef ZFIMPLBASE
 #define ZFIMPLBASE ZWilsonImpl
 #endif
 #define ZFIMPL  HADRONS_IMPL(ZFIMPLBASE, R)
 #define ZFIMPLF HADRONS_IMPL(ZFIMPLBASE, F)
 #define ZFIMPLD HADRONS_IMPL(ZFIMPLBASE, D)
 // scalar implementation
 #ifndef SIMPLBASE
 #define SIMPLBASE ScalarImplC
 #endif
 #define SIMPL  HADRONS_IMPL(SIMPLBASE, R)
 #define SIMPLF HADRONS_IMPL(SIMPLBASE, F)
 #define SIMPLD HADRONS_IMPL(SIMPLBASE, D)
 // gauge implementation
 #ifndef GIMPLBASE
 #define GIMPLBASE PeriodicGimpl
 #endif
 #define GIMPL  HADRONS_IMPL(GIMPLBASE, R)
 #define GIMPLF HADRONS_IMPL(GIMPLBASE, F)
 #define GIMPLD HADRONS_IMPL(GIMPLBASE, D)
 BEGIN_HADRONS_NAMESPACE
 // type aliases
 // The macros below inject typedefs into the class or scope where they are
 // expanded; `suffix` is appended to every alias name so that several
 // implementations can coexist (it may be left empty).
 // Field, propagator and complex-field aliases of `Impl`, plus the "sliced"
 // (std::vector of scalar site objects) variants.
 #define BASIC_TYPE_ALIASES(Impl, suffix)\
 typedef typename Impl::Field                         ScalarField##suffix;\
 typedef typename Impl::PropagatorField               PropagatorField##suffix;\
 typedef typename Impl::SitePropagator::scalar_object SitePropagator##suffix;\
 typedef typename Impl::ComplexField                  ComplexField##suffix;\
 typedef std::vector<SitePropagator##suffix>          SlicedPropagator##suffix;\
 typedef std::vector<typename ComplexField##suffix::vector_object::scalar_object> SlicedComplex##suffix;
 // BASIC_TYPE_ALIASES plus the fermion-specific aliases of `FImpl`.
 #define FERM_TYPE_ALIASES(FImpl, suffix)\
 BASIC_TYPE_ALIASES(FImpl, suffix);\
 typedef FermionOperator<FImpl>                     FMat##suffix;\
 typedef typename FImpl::FermionField               FermionField##suffix;\
 typedef typename FImpl::GaugeField                 GaugeField##suffix;\
 typedef typename FImpl::DoubledGaugeField          DoubledGaugeField##suffix;\
 typedef Lattice<iSpinMatrix<typename FImpl::Simd>> SpinMatrixField##suffix;
 // Gauge field alias of `GImpl`.
 #define GAUGE_TYPE_ALIASES(GImpl, suffix)\
 typedef typename GImpl::GaugeField GaugeField##suffix;
 // Solver alias for `FImpl`.
 #define SOLVER_TYPE_ALIASES(FImpl, suffix)\
 typedef Solver<FImpl> Solver##suffix;
 // Sink function alias: callable mapping a PropagatorField to a
 // SlicedPropagator. Requires the BASIC_TYPE_ALIASES names with the same
 // suffix to be in scope.
 #define SINK_TYPE_ALIASES(suffix)\
 typedef std::function<SlicedPropagator##suffix\
                      (const PropagatorField##suffix &)> SinkFn##suffix;
 // logger
 // Grid Logger whose top-level name is fixed to "Hadrons" and whose colour is
 // "BLACK" in GridLogColours; `on` and `nm` are forwarded to Logger.
 class HadronsLogger: public Logger
 {
 public:
    HadronsLogger(int on, std::string nm): Logger("Hadrons", on, nm,
                                                  GridLogColours, "BLACK"){};
 };
 // LOG(Message) << ... streams to std::cout through HadronsLogMessage, etc.
 #define LOG(channel) std::cout << HadronsLog##channel
 // Print "<expression>= <value>" on the Debug channel. The expansion already
 // ends with ';'.
 #define HADRONS_DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
 // logging channels (defined in Global.cc)
 extern HadronsLogger HadronsLogError;
 extern HadronsLogger HadronsLogWarning;
 extern HadronsLogger HadronsLogMessage;
 extern HadronsLogger HadronsLogIterative;
 extern HadronsLogger HadronsLogDebug;
 extern HadronsLogger HadronsLogIRL;
 // set widths and activation state of the channels (defined in Global.cc)
 void initLogger(void);
 // singleton pattern
 // To be expanded inside the body of class `name`: deletes copy construction
 // and copy assignment, provides getInstance() returning a reference to a
 // function-local static instance, and declares a private default constructor
 // that the class must define itself. The expansion leaves the access level
 // at `private:`.
 #define SINGLETON(name)\
 public:\
    name(const name &e) = delete;\
    void operator=(const name &e) = delete;\
    static name & getInstance(void)\
    {\
        static name e;\
        return e;\
    }\
 private:\
    name(void);
 // Same as SINGLETON, but the private default constructor is defaulted.
 #define SINGLETON_DEFCTOR(name)\
 public:\
    name(const name &e) = delete;\
    void operator=(const name &e) = delete;\
    static name & getInstance(void)\
    {\
        static name e;\
        return e;\
    }\
 private:\
    name(void) = default;
 // type utilities
 // Pointer to the std::type_info of the object `x` (typeid applied to the
 // reference, so the dynamic type is used for polymorphic objects).
 template <typename T>
 const std::type_info * typeIdPt(const T &x)
 {
    const std::type_info &info = typeid(x);

    return &info;
 }
 // Pointer to the std::type_info of the type `T`.
 template <typename T>
 const std::type_info * typeIdPt(void)
 {
    const std::type_info &info = typeid(T);

    return &info;
 }
 // Hash code of a type, from its type_info (defined in Global.cc), from an
 // object or from a type given as template argument.
 size_t typeHash(const std::type_info *info);
 template <typename T>
 size_t typeHash(const T &x)
 {
    const std::type_info *info = typeIdPt(x);

    return typeHash(info);
 }
 template <typename T>
 size_t typeHash(void)
 {
    const std::type_info *info = typeIdPt<T>();

    return typeHash(info);
 }
 // Readable name of a type, with the same three entry points as typeHash.
 std::string typeName(const std::type_info *info);
 template <typename T>
 std::string typeName(const T &x)
 {
    const std::type_info *info = typeIdPt(x);

    return typeName(info);
 }
 template <typename T>
 std::string typeName(void)
 {
    const std::type_info *info = typeIdPt<T>();

    return typeName(info);
 }
 // default writers/readers
 // File extension of result files (defined in Global.cc).
 extern const std::string resultFileExt;
 // Default serialisation backend for results: HDF5 when available, XML
 // otherwise.
 #ifdef HAVE_HDF5
 typedef Hdf5Reader ResultReader;
 typedef Hdf5Writer ResultWriter;
 #else
 typedef XmlReader ResultReader;
 typedef XmlWriter ResultWriter;
 #endif
 // "<name>.<traj>.<resultFileExt>". The expansion is not parenthesised and
 // `name` is used as the left operand of '+': it has to be a std::string
 // expression, not a string literal.
 #define RESULT_FILE_NAME(name, traj) \
 name + "." + std::to_string(traj) + "." + resultFileExt
 // recursive mkdir
 // Size of the path buffer used by the recursive mkdir implementation.
 #define MAX_PATH_LENGTH 512u
 // Create `dirName` and its missing parents; 0 on success or if nothing had to
 // be done.
 int         mkdir(const std::string dirName);
 // Part of `s` after the last '/' (`s` itself if there is none).
 std::string basename(const std::string &s);
 // Part of `s` before the last '/' (empty if there is none).
 std::string dirname(const std::string &s);
 // Create the directory containing `filename`; when a grid is given only the
 // process for which g->IsBoss() is true does it. Throws Exceptions::Io on
 // failure.
 void        makeFileDir(const std::string filename, GridBase *g = nullptr);
 // default Schur convention
 // Convention name pasted into the type names below; can be overridden before
 // this header is included.
 #ifndef HADRONS_DEFAULT_SCHUR 
 #define HADRONS_DEFAULT_SCHUR DiagTwo
 #endif
 // Each family uses a two-level macro so that `conv` is expanded before token
 // pasting: HADRONS_SCHUR_OP(conv) -> Schur<conv>Operator.
 #define _HADRONS_SCHUR_OP_(conv) Schur##conv##Operator
 #define HADRONS_SCHUR_OP(conv) _HADRONS_SCHUR_OP_(conv)
 #define HADRONS_DEFAULT_SCHUR_OP HADRONS_SCHUR_OP(HADRONS_DEFAULT_SCHUR)
 // HADRONS_SCHUR_SOLVE(conv) -> SchurRedBlack<conv>Solve
 #define _HADRONS_SCHUR_SOLVE_(conv) SchurRedBlack##conv##Solve
 #define HADRONS_SCHUR_SOLVE(conv) _HADRONS_SCHUR_SOLVE_(conv)
 #define HADRONS_DEFAULT_SCHUR_SOLVE HADRONS_SCHUR_SOLVE(HADRONS_DEFAULT_SCHUR)
 // HADRONS_SCHUR_A2A(conv) -> A2AVectorsSchur<conv>
 #define _HADRONS_SCHUR_A2A_(conv) A2AVectorsSchur##conv
 #define HADRONS_SCHUR_A2A(conv) _HADRONS_SCHUR_A2A_(conv)
 #define HADRONS_DEFAULT_SCHUR_A2A HADRONS_SCHUR_A2A(HADRONS_DEFAULT_SCHUR)
 // stringify macro
 // HADRONS_STR(x) expands `x` first and then turns the result into a string
 // literal.
 #define _HADRONS_STR(x) #x
 #define HADRONS_STR(x) _HADRONS_STR(x)
 // pretty print time profile
 void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);
 // token replacement utility
 // Replace the FIRST occurrence of `mark + token + mark` in `str` by
 // std::to_string(x); `str` is left untouched when the marked token is absent.
 // `T` must be a type accepted by std::to_string.
 template <typename T>
 void tokenReplace(std::string &str, const std::string token,
                  const T &x, const std::string mark = "@")
 {
    const std::string            pattern = mark + token + mark;
    const std::string::size_type where   = str.find(pattern);

    if (where == std::string::npos)
    {
        return;
    }
    str.replace(where, pattern.length(), std::to_string(x));
 }
 // generic correlator class
 // Serialisable pair (metadata, correlator values): `info` holds the
 // user-defined Metadata and `corr` the values.
 // NOTE(review): the `Scalar` template parameter only appears in the class
 // name passed to the serialisation macro; `corr` is always a
 // std::vector<Complex>. Presumably std::vector<Scalar> was intended --
 // confirm before changing, explicit instantiations may rely on the current
 // member type.
 template <typename Metadata, typename Scalar = Complex>
 struct Correlator: Serializable
 {
    GRID_SERIALIZABLE_CLASS_MEMBERS(ARG(Correlator<Metadata, Scalar>),
                                    Metadata,             info,
                                    std::vector<Complex>, corr);
 };
 END_HADRONS_NAMESPACE
 #include <Hadrons/Exceptions.hpp>
 #endif // Hadrons_Global_hpp_
--- a/Hadrons/Makefile.am
+++ b/Hadrons/Makefile.am
@@ -1,37 +0,0 @@
 # NOTE(review): "." is listed before Utilities, presumably so that
 # libHadrons.a is built before the Utilities subdirectory -- confirm.
 SUBDIRS = . Utilities
 # static library built from the sources listed in libHadrons_a_SOURCES
 lib_LIBRARIES = libHadrons.a
 # NOTE(review): presumably provides modules_cc / modules_hpp referenced
 # below -- confirm.
 include modules.inc
 libHadrons_a_SOURCES = \
    Application.cc     \
    Environment.cc     \
 	Exceptions.cc      \
    Global.cc          \
    Module.cc		   \
 	TimerArray.cc      \
 	VirtualMachine.cc  \
 	$(modules_cc)
 libHadrons_adir = $(includedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
 	A2AVectors.hpp            \
 	A2AMatrix.hpp             \
 	Application.hpp           \
 	DilutedNoise.hpp          \
 	DiskVector.hpp            \
 	EigenPack.hpp             \
 	Environment.hpp           \
 	Exceptions.hpp            \
 	Factory.hpp               \
 	GeneticScheduler.hpp      \
 	Global.hpp                \
 	Graph.hpp                 \
 	Module.hpp                \
 	Modules.hpp               \
 	ModuleFactory.hpp         \
 	Solver.hpp                \
 	TimerArray.hpp            \
 	VirtualMachine.hpp        \
 	$(modules_hpp)
--- a/Hadrons/Modules.hpp
+++ b/Hadrons/Modules.hpp
@@ -1,71 +0,0 @@
 #include <Hadrons/Modules/MSource/Gauss.hpp>
 #include <Hadrons/Modules/MSource/Momentum.hpp>
 #include <Hadrons/Modules/MSource/SeqAslash.hpp>
 #include <Hadrons/Modules/MSource/Z2.hpp>
 #include <Hadrons/Modules/MSource/Point.hpp>
 #include <Hadrons/Modules/MSource/SeqGamma.hpp>
 #include <Hadrons/Modules/MSource/Convolution.hpp>
 #include <Hadrons/Modules/MSource/Wall.hpp>
 #include <Hadrons/Modules/MSource/SeqConserved.hpp>
 #include <Hadrons/Modules/MScalarSUN/Div.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
 #include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
 #include <Hadrons/Modules/MScalarSUN/Grad.hpp>
 #include <Hadrons/Modules/MScalarSUN/Utils.hpp>
 #include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
 #include <Hadrons/Modules/MScalarSUN/EMT.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
 #include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
 #include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
 #include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MScalar/FreeProp.hpp>
 #include <Hadrons/Modules/MScalar/Scalar.hpp>
 #include <Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Hadrons/Modules/MAction/Wilson.hpp>
 #include <Hadrons/Modules/MAction/ScaledDWF.hpp>
 #include <Hadrons/Modules/MAction/MobiusDWF.hpp>
 #include <Hadrons/Modules/MAction/WilsonClover.hpp>
 #include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
 #include <Hadrons/Modules/MAction/DWF.hpp>
 #include <Hadrons/Modules/MGauge/UnitEm.hpp>
 #include <Hadrons/Modules/MGauge/Electrify.hpp>
 #include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
 #include <Hadrons/Modules/MGauge/Random.hpp>
 #include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
 #include <Hadrons/Modules/MGauge/GaugeFix.hpp>
 #include <Hadrons/Modules/MGauge/Unit.hpp>
 #include <Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
 #include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
 #include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
 #include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
 #include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
 #include <Hadrons/Modules/MIO/LoadNersc.hpp>
 #include <Hadrons/Modules/MIO/LoadBinary.hpp>
 #include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
 #include <Hadrons/Modules/MContraction/WeakEye3pt.hpp>
 #include <Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp>
 #include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
 #include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
 #include <Hadrons/Modules/MContraction/A2ALoop.hpp>
 #include <Hadrons/Modules/MContraction/WeakNonEye3pt.hpp>
 #include <Hadrons/Modules/MContraction/DiscLoop.hpp>
 #include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 #include <Hadrons/Modules/MContraction/Baryon.hpp>
 #include <Hadrons/Modules/MContraction/Meson.hpp>
 #include <Hadrons/Modules/MNPR/FourQuark.hpp>
 #include <Hadrons/Modules/MNPR/Bilinear.hpp>
 #include <Hadrons/Modules/MNPR/Amputate.hpp>
 #include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
 #include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Hadrons/Modules/MSolver/Guesser.hpp>
 #include <Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
 #include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
 #include <Hadrons/Modules/MFermion/FreeProp.hpp>
 #include <Hadrons/Modules/MFermion/GaugeProp.hpp>
 #include <Hadrons/Modules/MFermion/EMLepton.hpp>
 #include <Hadrons/Modules/MSink/Smear.hpp>
 #include <Hadrons/Modules/MSink/Point.hpp>
--- a/Hadrons/Modules/MAction/DWF.cc
+++ b/Hadrons/Modules/MAction/DWF.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/DWF.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/DWF.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiation of the module for the default fermion
 // implementation; the FIMPLF variant is only instantiated when
 // GRID_DEFAULT_PRECISION_DOUBLE is defined.
 template class Grid::Hadrons::MAction::TDWF<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TDWF<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/MobiusDWF.cc
+++ b/Hadrons/Modules/MAction/MobiusDWF.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/MobiusDWF.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/MobiusDWF.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiation of the module for the default fermion
 // implementation; the FIMPLF variant is only instantiated when
 // GRID_DEFAULT_PRECISION_DOUBLE is defined (same condition as the module
 // registration in MobiusDWF.hpp).
 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TMobiusDWF<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/MobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/MobiusDWF.hpp
@@ -1,156 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/MobiusDWF.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MAction_MobiusDWF_hpp_
 #define Hadrons_MAction_MobiusDWF_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                      Mobius domain-wall fermion action                     *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MAction)
 // Parameters of the MobiusDWF module, as used by TMobiusDWF::setup():
 //  - gauge   : name of the gauge field object in the environment;
 //  - Ls      : extent passed to the 5d grid getters and to MobiusFermion;
 //  - mass, M5, b, c : forwarded unchanged to the MobiusFermion constructor;
 //  - boundary: string parsed into the vector of boundary phases (may be
 //              empty, the implementation defaults are then kept);
 //  - twist   : string parsed into the vector of twists (may be empty, same
 //              rule).
 class MobiusDWFPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(MobiusDWFPar,
                                    std::string , gauge,
                                    unsigned int, Ls,
                                    double      , mass,
                                    double      , M5,
                                    double      , b,
                                    double      , c,
                                    std::string , boundary,
                                    std::string , twist);
 };
 // Module creating a MobiusFermion<FImpl> operator in the environment under
 // the module's own name. All the work is done in setup(); execute() is empty.
 template <typename FImpl>
 class TMobiusDWF: public Module<MobiusDWFPar>
 {
 public:
    // type aliases for FImpl with an empty suffix (FMat, FermionField, ...)
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // constructor
    TMobiusDWF(const std::string name);
    // destructor
    virtual ~TMobiusDWF(void) {};
    // dependency relation
    // input: the gauge field named in the parameters; output: the module name
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 // Register the module for the default implementation, and for FIMPLF when
 // GRID_DEFAULT_PRECISION_DOUBLE is defined.
 MODULE_REGISTER_TMP(MobiusDWF, TMobiusDWF<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(MobiusDWFF, TMobiusDWF<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                      TMobiusDWF implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forward the module name to the Module<MobiusDWFPar> base; nothing else is
 // initialised here.
 template <typename FImpl>
 TMobiusDWF<FImpl>::TMobiusDWF(const std::string name)
 : Module<MobiusDWFPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // The only dependency of the module is the gauge field named in the
 // parameters.
 template <typename FImpl>
 std::vector<std::string> TMobiusDWF<FImpl>::getInput(void)
 {
    return std::vector<std::string>{par().gauge};
 }
 // The module produces a single object, named after the module itself.
 template <typename FImpl>
 std::vector<std::string> TMobiusDWF<FImpl>::getOutput(void)
 {
    return std::vector<std::string>{getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Build the MobiusFermion<FImpl> operator and store it in the environment
 // under the module name.
 // Throws Exceptions::Size when the number of boundary phases or of twists
 // differs from env().getNd().
 template <typename FImpl>
 void TMobiusDWF<FImpl>::setup(void)
 {
    LOG(Message) << "Setting up Mobius domain wall fermion matrix with m= "
                 << par().mass << ", M5= " << par().M5 << ", Ls= " << par().Ls 
                 << ", b= " << par().b << ", c= " << par().c
                 << " using gauge field '" << par().gauge << "'"
                 << std::endl;
    // gauge field and grids: g4/grb4 from the getters without extent argument,
    // g5/grb5 from the getters called with Ls
    auto &U    = envGet(GaugeField, par().gauge);
    auto &g4   = *envGetGrid(FermionField);
    auto &grb4 = *envGetRbGrid(FermionField);
    auto &g5   = *envGetGrid(FermionField, par().Ls);
    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
    // implementation parameters: the default-constructed values are kept
    // unless the corresponding parameter string is non-empty
    typename MobiusFermion<FImpl>::ImplParams implParams;
    if (!par().boundary.empty())
    {
        implParams.boundary_phases = strToVec<Complex>(par().boundary);
    }
    if (!par().twist.empty())
    {
        implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
    }
    LOG(Message) << "Fermion boundary conditions: " << implParams.boundary_phases
                 << std::endl;
    LOG(Message) << "Twists: " << implParams.twist_n_2pi_L
                 << std::endl;
    // both vectors need exactly one entry per dimension (this also checks the
    // defaults when the parameter strings were empty)
    if (implParams.boundary_phases.size() != env().getNd())
    {
        HADRONS_ERROR(Size, "Wrong number of boundary phase");
    }
    if (implParams.twist_n_2pi_L.size() != env().getNd())
    {
        HADRONS_ERROR(Size, "Wrong number of twist");
    }
    // object is registered with type FMat and concrete type MobiusFermion
    envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), par().Ls, U, g5,
                     grb5, g4, grb4, par().mass, par().M5, par().b, par().c,
                     implParams);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Intentionally empty: the operator is fully constructed in setup().
 template <typename FImpl>
 void TMobiusDWF<FImpl>::execute(void)
 {}
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MAction_MobiusDWF_hpp_
--- a/Hadrons/Modules/MAction/ScaledDWF.cc
+++ b/Hadrons/Modules/MAction/ScaledDWF.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/ScaledDWF.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/ScaledDWF.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiation of TScaledDWF for FIMPL; the FIMPLF instantiation is
 // compiled only when GRID_DEFAULT_PRECISION_DOUBLE is defined.
 template class Grid::Hadrons::MAction::TScaledDWF<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TScaledDWF<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/ScaledDWF.hpp
+++ b/Hadrons/Modules/MAction/ScaledDWF.hpp
@@ -1,155 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/ScaledDWF.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MAction_ScaledDWF_hpp_
 #define Hadrons_MAction_ScaledDWF_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                      Scaled domain wall fermion                            *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MAction)
 // Parameter set of the ScaledDWF module, as consumed by TScaledDWF::setup():
 //   gauge    - name of the GaugeField object fetched from the environment
 //   Ls       - passed to the 5d grid lookups and to the fermion constructor
 //   mass, M5 - forwarded to the ScaledShamirFermion constructor
 //   scale    - forwarded to the ScaledShamirFermion constructor
 //   boundary - if non-empty, parsed with strToVec<Complex> into boundary_phases
 //   twist    - if non-empty, parsed with strToVec<Real> into twist_n_2pi_L
 class ScaledDWFPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ScaledDWFPar,
                                    std::string , gauge,
                                    unsigned int, Ls,
                                    double      , mass,
                                    double      , M5,
                                    double      , scale,
                                    std::string , boundary,
                                    std::string , twist);
 };
 // Module creating a ScaledShamirFermion object from a ScaledDWFPar
 // parameter set; all the work happens in setup().
 template <typename FImpl>
 class TScaledDWF: public Module<ScaledDWFPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // build the module from its name
    TScaledDWF(const std::string name);
    // nothing to release
    virtual ~TScaledDWF() = default;
    // names of the objects consumed / produced by the module
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();
    // object creation
    virtual void setup();
    // run-time work (none)
    virtual void execute();
 };
 // MODULE_REGISTER_TMP is defined outside this file (presumably it registers
 // the instantiation with the module factory - confirm). ScaledDWFF / FIMPLF is
 // only registered when GRID_DEFAULT_PRECISION_DOUBLE is defined.
 MODULE_REGISTER_TMP(ScaledDWF, TScaledDWF<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(ScaledDWFF, TScaledDWF<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                      TScaledDWF implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
 TScaledDWF<FImpl>::TScaledDWF(const std::string name)
 : Module<ScaledDWFPar>(name)
 {
    // the base class receives the name; no further initialisation
 }
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl>
 std::vector<std::string> TScaledDWF<FImpl>::getInput(void)
 {
    // the gauge field named in the parameters is the only input
    return {par().gauge};
 }
 template <typename FImpl>
 std::vector<std::string> TScaledDWF<FImpl>::getOutput(void)
 {
    // the single output carries the module's own name
    return {getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TScaledDWF<FImpl>::setup(void)
 {
    // Log the action parameters read from par() before anything is created.
    LOG(Message) << "Setting up scaled domain wall fermion matrix with m= "
                 << par().mass
                 << ", M5= " << par().M5
                 << ", Ls= " << par().Ls
                 << ", scale= " << par().scale
                 << " using gauge field '" << par().gauge << "'"
                 << std::endl;
    // Gauge field and the grids requested without and with the Ls argument
    // (plain and "Rb" variant of each).
    auto &gaugeU  = envGet(GaugeField, par().gauge);
    auto &grid4   = *envGetGrid(FermionField);
    auto &rbGrid4 = *envGetRbGrid(FermionField);
    auto &grid5   = *envGetGrid(FermionField, par().Ls);
    auto &rbGrid5 = *envGetRbGrid(FermionField, par().Ls);
    // Implementation parameters keep their default-constructed values unless
    // the corresponding parameter string is non-empty.
    typename ScaledShamirFermion<FImpl>::ImplParams fermPar;
    if (!par().boundary.empty())
    {
        fermPar.boundary_phases = strToVec<Complex>(par().boundary);
    }
    if (!par().twist.empty())
    {
        fermPar.twist_n_2pi_L = strToVec<Real>(par().twist);
    }
    LOG(Message) << "Fermion boundary conditions: "
                 << fermPar.boundary_phases << std::endl;
    LOG(Message) << "Twists: "
                 << fermPar.twist_n_2pi_L << std::endl;
    // Both vectors must have exactly env().getNd() entries.
    if (env().getNd() != fermPar.boundary_phases.size())
    {
        HADRONS_ERROR(Size, "Wrong number of boundary phase");
    }
    if (env().getNd() != fermPar.twist_n_2pi_L.size())
    {
        HADRONS_ERROR(Size, "Wrong number of twist");
    }
    // Create the ScaledShamirFermion object under this module's name.
    envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), par().Ls,
                     gaugeU, grid5, rbGrid5, grid4, rbGrid4,
                     par().mass, par().M5, par().scale,
                     fermPar);
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TScaledDWF<FImpl>::execute(void)
 {
    // intentionally empty: this module performs no work in execute()
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MAction_ScaledDWF_hpp_
--- a/Hadrons/Modules/MAction/Wilson.cc
+++ b/Hadrons/Modules/MAction/Wilson.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/Wilson.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/Wilson.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiation of TWilson for FIMPL; the FIMPLF instantiation is
 // compiled only when GRID_DEFAULT_PRECISION_DOUBLE is defined.
 template class Grid::Hadrons::MAction::TWilson<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TWilson<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/WilsonClover.cc
+++ b/Hadrons/Modules/MAction/WilsonClover.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/WilsonClover.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/WilsonClover.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiation of TWilsonClover for FIMPL; the FIMPLF instantiation
 // is compiled only when GRID_DEFAULT_PRECISION_DOUBLE is defined.
 template class Grid::Hadrons::MAction::TWilsonClover<FIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TWilsonClover<FIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/WilsonClover.hpp
+++ b/Hadrons/Modules/MAction/WilsonClover.hpp
@@ -1,157 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/WilsonClover.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: pretidav <david.preti@csic.es>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MAction_WilsonClover_hpp_
 #define Hadrons_MAction_WilsonClover_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         Wilson clover quark action                         *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MAction)
 // Parameter set of the WilsonClover module, as consumed by
 // TWilsonClover::setup():
 //   gauge             - name of the GaugeField object fetched from the environment
 //   mass              - forwarded to the WilsonCloverFermion constructor
 //   csw_r, csw_t      - clover coefficients, logged and forwarded
 //   clover_anisotropy - forwarded to the WilsonCloverFermion constructor
 //   boundary          - if non-empty, parsed with strToVec<Complex>
 //   twist             - if non-empty, parsed with strToVec<Real>
 class WilsonCloverPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonCloverPar,
                                    std::string                 , gauge,
                                    double                      , mass,
                                    double                      , csw_r,
                                    double                      , csw_t,
                                    WilsonAnisotropyCoefficients, clover_anisotropy,
                                    std::string                 , boundary,
                                    std::string                 , twist);
 };
 // Module creating a WilsonCloverFermion object from a WilsonCloverPar
 // parameter set; all the work happens in setup().
 template <typename FImpl>
 class TWilsonClover: public Module<WilsonCloverPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // build the module from its name
    TWilsonClover(const std::string name);
    // nothing to release
    virtual ~TWilsonClover() = default;
    // names of the objects consumed / produced by the module
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();
    // object creation
    virtual void setup();
    // run-time work (none)
    virtual void execute();
 };
 // MODULE_REGISTER_TMP is defined outside this file (presumably it registers
 // the instantiation with the module factory - confirm). WilsonCloverF / FIMPLF
 // is only registered when GRID_DEFAULT_PRECISION_DOUBLE is defined.
 MODULE_REGISTER_TMP(WilsonClover, TWilsonClover<FIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(WilsonCloverF, TWilsonClover<FIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                    TWilsonClover template implementation                   *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
 TWilsonClover<FImpl>::TWilsonClover(const std::string name)
 : Module<WilsonCloverPar>(name)
 {
    // the base class receives the name; no further initialisation
 }
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl>
 std::vector<std::string> TWilsonClover<FImpl>::getInput(void)
 {
    // the gauge field named in the parameters is the only input
    return {par().gauge};
 }
 template <typename FImpl>
 std::vector<std::string> TWilsonClover<FImpl>::getOutput(void)
 {
    // the single output carries the module's own name
    return {getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TWilsonClover<FImpl>::setup(void)
 {
    // Log the action parameters read from par() before anything is created.
    LOG(Message) << "Setting up Wilson clover fermion matrix with m= "
                 << par().mass
                 << " using gauge field '" << par().gauge << "'"
                 << std::endl;
    LOG(Message) << "Clover term csw_r: " << par().csw_r
                 << " csw_t: " << par().csw_t << std::endl;
    // Gauge field and the plain / "Rb" grids for FermionField.
    auto &gaugeU  = envGet(GaugeField, par().gauge);
    auto &fullGrd = *envGetGrid(FermionField);
    auto &rbGrd   = *envGetRbGrid(FermionField);
    // Implementation parameters keep their default-constructed values unless
    // the corresponding parameter string is non-empty.
    typename WilsonCloverFermion<FImpl>::ImplParams fermPar;
    if (!par().boundary.empty())
    {
        fermPar.boundary_phases = strToVec<Complex>(par().boundary);
    }
    if (!par().twist.empty())
    {
        fermPar.twist_n_2pi_L = strToVec<Real>(par().twist);
    }
    LOG(Message) << "Fermion boundary conditions: "
                 << fermPar.boundary_phases << std::endl;
    LOG(Message) << "Twists: "
                 << fermPar.twist_n_2pi_L << std::endl;
    // Both vectors must have exactly env().getNd() entries.
    if (env().getNd() != fermPar.boundary_phases.size())
    {
        HADRONS_ERROR(Size, "Wrong number of boundary phase");
    }
    if (env().getNd() != fermPar.twist_n_2pi_L.size())
    {
        HADRONS_ERROR(Size, "Wrong number of twist");
    }
    // Create the WilsonCloverFermion object under this module's name.
    envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1,
                     gaugeU, fullGrd, rbGrd,
                     par().mass, par().csw_r, par().csw_t,
                     par().clover_anisotropy, fermPar);
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TWilsonClover<FImpl>::execute(void)
 {
    // intentionally empty: this module performs no work in execute()
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MAction_WilsonClover_hpp_
--- a/Hadrons/Modules/MAction/ZMobiusDWF.cc
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.cc
@@ -1,37 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/ZMobiusDWF.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MAction;
 // Explicit instantiation of TZMobiusDWF for ZFIMPL; the ZFIMPLF instantiation
 // is compiled only when GRID_DEFAULT_PRECISION_DOUBLE is defined.
 template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPL>;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPLF>;
 #endif
--- a/Hadrons/Modules/MAction/ZMobiusDWF.hpp
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.hpp
@@ -1,163 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MAction/ZMobiusDWF.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MAction_ZMobiusDWF_hpp_
 #define Hadrons_MAction_ZMobiusDWF_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                      z-Mobius domain-wall fermion action                   *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MAction)
 // Parameter set of the ZMobiusDWF module, as consumed by TZMobiusDWF::setup():
 //   gauge    - name of the GaugeField object fetched from the environment
 //   Ls       - passed to the 5d grid lookups and to the fermion constructor
 //   mass, M5 - forwarded to the ZMobiusFermion constructor
 //   b, c     - forwarded to the ZMobiusFermion constructor
 //   omega    - complex coefficients, logged one by one and forwarded (as a copy)
 //   boundary - if non-empty, parsed with strToVec<Complex> into boundary_phases
 //   twist    - if non-empty, parsed with strToVec<Real> into twist_n_2pi_L
 class ZMobiusDWFPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(ZMobiusDWFPar,
                                    std::string                      , gauge,
                                    unsigned int                     , Ls,
                                    double                           , mass,
                                    double                           , M5,
                                    double                           , b,
                                    double                           , c,
                                    std::vector<std::complex<double>>, omega,
                                    std::string                      , boundary,
                                    std::string                      , twist);
 };
 // Module creating a ZMobiusFermion object from a ZMobiusDWFPar parameter
 // set; all the work happens in setup().
 template <typename FImpl>
 class TZMobiusDWF: public Module<ZMobiusDWFPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
 public:
    // build the module from its name
    TZMobiusDWF(const std::string name);
    // nothing to release
    virtual ~TZMobiusDWF() = default;
    // names of the objects consumed / produced by the module
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();
    // object creation
    virtual void setup();
    // run-time work (none)
    virtual void execute();
 };
 // MODULE_REGISTER_TMP is defined outside this file (presumably it registers
 // the instantiation with the module factory - confirm). ZMobiusDWFF / ZFIMPLF
 // is only registered when GRID_DEFAULT_PRECISION_DOUBLE is defined.
 MODULE_REGISTER_TMP(ZMobiusDWF, TZMobiusDWF<ZFIMPL>, MAction);
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 MODULE_REGISTER_TMP(ZMobiusDWFF, TZMobiusDWF<ZFIMPLF>, MAction);
 #endif
 /******************************************************************************
 *                     TZMobiusDWF implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
 TZMobiusDWF<FImpl>::TZMobiusDWF(const std::string name)
 : Module<ZMobiusDWFPar>(name)
 {
    // the base class receives the name; no further initialisation
 }
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl>
 std::vector<std::string> TZMobiusDWF<FImpl>::getInput(void)
 {
    // the gauge field named in the parameters is the only input
    return {par().gauge};
 }
 template <typename FImpl>
 std::vector<std::string> TZMobiusDWF<FImpl>::getOutput(void)
 {
    // the single output carries the module's own name
    return {getName()};
 }
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TZMobiusDWF<FImpl>::setup(void)
 {
    // Log the action parameters read from par() before anything is created.
    LOG(Message) << "Setting up z-Mobius domain wall fermion matrix with m= "
                 << par().mass
                 << ", M5= " << par().M5
                 << ", Ls= " << par().Ls
                 << ", b= " << par().b
                 << ", c= " << par().c
                 << " using gauge field '" << par().gauge << "'"
                 << std::endl;
    // One log line per omega coefficient, in index order.
    LOG(Message) << "Omegas: " << std::endl;
    for (unsigned int k = 0; k < par().omega.size(); ++k)
    {
        LOG(Message) << "  omega[" << k << "]= " << par().omega[k] << std::endl;
    }
    // Gauge field and the grids requested without and with the Ls argument
    // (plain and "Rb" variant of each).
    auto &gaugeU  = envGet(GaugeField, par().gauge);
    auto &grid4   = *envGetGrid(FermionField);
    auto &rbGrid4 = *envGetRbGrid(FermionField);
    auto &grid5   = *envGetGrid(FermionField, par().Ls);
    auto &rbGrid5 = *envGetRbGrid(FermionField, par().Ls);
    // Local copy of the coefficients handed to the fermion constructor.
    auto omegaCoef = par().omega;
    // Implementation parameters keep their default-constructed values unless
    // the corresponding parameter string is non-empty.
    typename ZMobiusFermion<FImpl>::ImplParams fermPar;
    if (!par().boundary.empty())
    {
        fermPar.boundary_phases = strToVec<Complex>(par().boundary);
    }
    if (!par().twist.empty())
    {
        fermPar.twist_n_2pi_L = strToVec<Real>(par().twist);
    }
    LOG(Message) << "Fermion boundary conditions: "
                 << fermPar.boundary_phases << std::endl;
    LOG(Message) << "Twists: "
                 << fermPar.twist_n_2pi_L << std::endl;
    // Both vectors must have exactly env().getNd() entries.
    if (env().getNd() != fermPar.boundary_phases.size())
    {
        HADRONS_ERROR(Size, "Wrong number of boundary phase");
    }
    if (env().getNd() != fermPar.twist_n_2pi_L.size())
    {
        HADRONS_ERROR(Size, "Wrong number of twist");
    }
    // Create the ZMobiusFermion object under this module's name.
    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls,
                     gaugeU, grid5, rbGrid5, grid4, rbGrid4,
                     par().mass, par().M5, omegaCoef,
                     par().b, par().c, fermPar);
 }
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TZMobiusDWF<FImpl>::execute(void)
 {
    // intentionally empty: this module performs no work in execute()
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MAction_ZMobiusDWF_hpp_
--- a/Hadrons/Modules/MContraction/A2AAslashField.cc
+++ b/Hadrons/Modules/MContraction/A2AAslashField.cc
@@ -1,34 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/A2AAslashField.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of TA2AAslashField for the (FIMPL, PhotonR) pair.
 template class Grid::Hadrons::MContraction::TA2AAslashField<FIMPL, PhotonR>;
--- a/Hadrons/Modules/MContraction/A2AAslashField.hpp
+++ b/Hadrons/Modules/MContraction/A2AAslashField.hpp
@@ -1,246 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/A2AAslashField.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_A2AAslashField_hpp_
 #define Hadrons_MContraction_A2AAslashField_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                         A2AAslashField                                 *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Parameter set of the A2AAslashField module, as used by TA2AAslashField:
 //   cacheBlock, block - passed to the Computation temporary built in setup()
 //   left, right       - names of the std::vector<FermionField> inputs
 //   output            - prefix of the output path; execute() appends
 //                       ".<trajectory>/<EM field name>.h5"
 //   emField           - names of the EM field objects to process
 class A2AAslashFieldPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldPar,
                                    int, cacheBlock,
                                    int, block,
                                    std::string, left,
                                    std::string, right,
                                    std::string, output,
                                    std::vector<std::string>, emField);
 };
 // Metadata record produced per EM field by TA2AAslashField::execute();
 // emFieldName is set to the corresponding entry of par().emField.
 class A2AAslashFieldMetadata: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldMetadata,
                                    std::string, emFieldName);
 };
 // A2AKernel implementation that delegates the block computation to
 // A2Autils<FImpl>::AslashField using two sets of complex fields held by
 // reference (the referenced vectors must outlive the kernel).
 template <typename T, typename FImpl>
 class AslashFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
 {
 public:
    using FermionField = typename FImpl::FermionField;
 public:
    // Stores references to the two field sets and the grid pointer, then
    // accumulates the product of the grid's global dimensions into vol_.
    AslashFieldKernel(const std::vector<LatticeComplex> &emB0,
                      const std::vector<LatticeComplex> &emB1,
                      GridBase *grid)
    : emB0_(emB0), emB1_(emB1), grid_(grid), vol_(1.)
    {
        for (const auto &extent: grid_->GlobalDimensions())
        {
            vol_ *= extent;
        }
    }
    virtual ~AslashFieldKernel(void) = default;
    // Fills m for the given left/right blocks; t is handed on by address.
    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
                            const FermionField *right,
                            const unsigned int orthogDim, double &t)
    {
        A2Autils<FImpl>::AslashField(m, left, right, emB0_, emB1_,
                                     orthogDim, &t);
    }
    // No flop estimate is provided: always 0.
    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        return 0.;
    }
    // No byte estimate is provided: always 0.
    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
    {
        return 0.;
    }
 private:
    const std::vector<LatticeComplex> &emB0_, &emB1_;
    GridBase                          *grid_;
    double                            vol_;
 };
 // Module computing all-to-all A-slash fields from two vectors of fermion
 // fields and a list of EM fields; results are written by the Computation
 // object, the module declares no output object.
 template <typename FImpl, typename PhotonImpl>
 class TA2AAslashField: public Module<A2AAslashFieldPar>
 {
 public:
    FERM_TYPE_ALIASES(FImpl,);
    using EmField     = typename PhotonImpl::GaugeField;
    using Computation = A2AMatrixBlockComputation<Complex,
                                                  FermionField,
                                                  A2AAslashFieldMetadata,
                                                  HADRONS_A2AM_IO_TYPE>;
    using Kernel      = AslashFieldKernel<Complex, FImpl>;
 public:
    // build the module from its name
    TA2AAslashField(const std::string name);
    // nothing to release
    virtual ~TA2AAslashField() = default;
    // names of the objects consumed / produced by the module
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();
    // temporaries creation
    virtual void setup();
    // computation
    virtual void execute();
 };
 // MODULE_REGISTER_TMP is defined outside this file (presumably it registers
 // the instantiation with the module factory - confirm). ARG(...) wraps the
 // type because its template argument list contains a comma.
 MODULE_REGISTER_TMP(A2AAslashField, ARG(TA2AAslashField<FIMPL, PhotonR>), MContraction);
 /******************************************************************************
 *                 TA2AAslashField implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl, typename PhotonImpl>
 TA2AAslashField<FImpl, PhotonImpl>::TA2AAslashField(const std::string name)
 : Module<A2AAslashFieldPar>(name)
 {
    // the base class receives the name; no further initialisation
 }
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl, typename PhotonImpl>
 std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getInput(void)
 {
    // every EM field first, then the left and right vector names
    std::vector<std::string> deps(par().emField);
    deps.push_back(par().left);
    deps.push_back(par().right);
    return deps;
 }
 template <typename FImpl, typename PhotonImpl>
 std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getOutput(void)
 {
    // the module declares no output object
    return std::vector<std::string>();
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Declares the temporaries later retrieved in execute() with envGetTmp.
 template <typename FImpl, typename PhotonImpl>
 void TA2AAslashField<FImpl, PhotonImpl>::setup(void)
 {
    // "computation": the Computation object; the arguments after the leading 1
    // are presumably forwarded to its constructor (envTmp is defined outside
    // this file - confirm their meaning against A2AMatrixBlockComputation).
    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
           env().getNd() - 1, par().emField.size(), 1, par().block, 
           par().cacheBlock, this);
    // "B0" / "B1": vectors of ComplexField with one entry per EM field; they
    // are filled in execute() from the EM field components.
    envTmp(std::vector<ComplexField>, "B0", 1, 
           par().emField.size(), envGetGrid(ComplexField));
    envTmp(std::vector<ComplexField>, "B1", 1, 
           par().emField.size(), envGetGrid(ComplexField));
    // "Amu": single ComplexField temporary.
    envTmpLat(ComplexField, "Amu");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Computes the all-to-all A-slash fields for every EM field listed in the
 // parameters: builds the complex combinations B0 = A_0 + i A_1 and
 // B1 = A_2 + i A_3 of each EM field, then runs the block computation, which
 // receives lambdas giving the I/O name, file name and metadata per EM field.
 template <typename FImpl, typename PhotonImpl>
 void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
 {
    auto &left  = envGet(std::vector<FermionField>, par().left);
    auto &right = envGet(std::vector<FermionField>, par().right);
    int nt  = env().getDim().back();
    int N_i = left.size();
    int N_j = right.size();
    LOG(Message) << "Computing all-to-all A-slash fields" << std::endl;
    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
    LOG(Message) << "EM fields:" << std::endl;
    for (auto &name: par().emField)
    {
        LOG(Message) << "  " << name << std::endl;
    }
    // The byte count starts from sizeof (an unsigned size type) so that the
    // whole product is evaluated in that type; multiplying the three ints
    // first could overflow int for large mode counts.
    LOG(Message) << "A-slash field size: " << nt << "*" << N_i << "*" << N_j 
                 << " (filesize " << sizeString(sizeof(HADRONS_A2AM_IO_TYPE)*nt*N_i*N_j) 
                 << "/EM field)" << std::endl;
    // preparing "B" complexified fields
    startTimer("Complexify EM fields");
    envGetTmp(std::vector<ComplexField>, B0);
    envGetTmp(std::vector<ComplexField>, B1);
    for (unsigned int i = 0; i < par().emField.size(); ++i)
    {
        auto &A = envGet(EmField, par().emField[i]);
        // NOTE(review): Amu is fetched but not used below; kept as in the
        // original because envGetTmp is a macro defined outside this file.
        envGetTmp(ComplexField, Amu);
        B0[i]  = peekLorentz(A, 0);
        B0[i] += timesI(peekLorentz(A, 1));
        B1[i]  = peekLorentz(A, 2);
        B1[i] += timesI(peekLorentz(A, 3));
    }
    stopTimer("Complexify EM fields");
    // I/O name & metadata lambdas (the second index is unused here)
    auto ionameFn = [this](const unsigned int em, const unsigned int dummy)
    {
        return par().emField[em];
    };
    // file name: <output>.<trajectory>/<EM field name>.h5
    auto filenameFn = [this, &ionameFn](const unsigned int em, const unsigned int dummy)
    {
        return par().output + "." + std::to_string(vm().getTrajectory()) 
               + "/" + ionameFn(em, dummy) + ".h5";
    };
    auto metadataFn = [this](const unsigned int em, const unsigned int dummy)
    {
        A2AAslashFieldMetadata md;
        md.emFieldName = par().emField[em];
        return md;
    };
    // executing computation
    Kernel kernel(B0, B1, envGetGrid(FermionField));
    envGetTmp(Computation, computation);
    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_A2AAslashField_hpp_
--- a/Hadrons/Modules/MContraction/A2ALoop.cc
+++ b/Hadrons/Modules/MContraction/A2ALoop.cc
@@ -1,34 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/A2ALoop.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/A2ALoop.hpp>
 // Bring the Grid, Hadrons and MContraction namespaces into scope for this
 // translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of the TA2ALoop module template for FIMPL.
 template class Grid::Hadrons::MContraction::TA2ALoop<FIMPL>;
--- a/Hadrons/Modules/MContraction/A2AMesonField.cc
+++ b/Hadrons/Modules/MContraction/A2AMesonField.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/A2AMesonField.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
 // Bring the Grid, Hadrons and MContraction namespaces into scope for this
 // translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of the TA2AMesonField module template for FIMPL.
 template class Grid::Hadrons::MContraction::TA2AMesonField<FIMPL>;
--- a/Hadrons/Modules/MContraction/A2AMesonField.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp
@@ -1,315 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/A2AMesonField.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_A2AMesonField_hpp_
 #define Hadrons_MContraction_A2AMesonField_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 BEGIN_HADRONS_NAMESPACE
 /******************************************************************************
 *                     All-to-all meson field creation                        *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Serialisable parameter set of the A2AMesonField module.
 //   cacheBlock, block: forwarded to the A2AMatrixBlockComputation constructor
 //   left, right      : names of the std::vector<FermionField> inputs
 //   output           : file name stem, results are written to
 //                      <output>.<trajectory>/<gamma>_<momentum>.h5
 //   gammas           : "all" selects the 16 bilinears listed in setup(),
 //                      anything else is parsed with strToVec<Gamma::Algebra>
 //   mom              : one string per momentum, each parsed with
 //                      strToVec<Real> and required to have Nd-1 components
 class A2AMesonFieldPar: Serializable
 {
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar,
                                     int, cacheBlock,
                                     int, block,
                                     std::string, left,
                                     std::string, right,
                                     std::string, output,
                                     std::string, gammas,
                                     std::vector<std::string>, mom);
 };
 // Serialisable metadata attached to each meson field: the momentum
 // components and the spin bilinear it was computed with (both are filled by
 // the metadata lambda in TA2AMesonField::execute).
 class A2AMesonFieldMetadata: Serializable
 {
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldMetadata,
                                     std::vector<RealF>, momentum,
                                     Gamma::Algebra, gamma);
 };
 // A2A kernel evaluating meson fields through A2Autils<FImpl>::MesonField for a
 // fixed list of spin bilinears and momentum phase fields. The two vectors are
 // held by reference, so they have to outlive the kernel.
 template <typename T, typename FImpl>
 class MesonFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
 {
 public:
     typedef typename FImpl::FermionField FermionField;
 public:
     // stores the inputs and accumulates the product of the global dimensions
     MesonFieldKernel(const std::vector<Gamma::Algebra> &gamma,
                      const std::vector<LatticeComplex> &mom,
                      GridBase *grid)
     : gamma_(gamma), mom_(mom), grid_(grid), vol_(1.)
     {
         const auto &dims = grid_->GlobalDimensions();
         for (unsigned int mu = 0; mu < dims.size(); ++mu)
         {
             vol_ *= dims[mu];
         }
     }
     virtual ~MesonFieldKernel(void) = default;
     // forwards the block contraction to A2Autils; t is handed over by address
     virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
                             const FermionField *right,
                             const unsigned int orthogDim, double &t)
     {
         A2Autils<FImpl>::MesonField(m, left, right, gamma_, mom_, orthogDim, &t);
     }
     // flop count formula for one blockSizei x blockSizej block
     virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
     {
         const double perSite = 2*8.0+6.0+8.0*mom_.size();
         return vol_*perSite*blockSizei*blockSizej*gamma_.size();
     }
     // byte count formula for one blockSizei x blockSizej block
     virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
     {
         // term independent of the number of bilinears
         const double gammaIndep = vol_*(12.0*sizeof(T))*blockSizei*blockSizej;
         // term scaling with the number of momenta and bilinears
         const double perGamma   = vol_*(2.0*sizeof(T)*mom_.size())*blockSizei*blockSizej*gamma_.size();
         return gammaIndep + perGamma;
     }
 private:
     const std::vector<Gamma::Algebra> &gamma_;
     const std::vector<LatticeComplex> &mom_;
     GridBase                          *grid_;
     double                            vol_;
 };
 // Hadrons module producing all-to-all meson fields from two sets of fermion
 // fields, for a list of spin bilinears and momenta.
 template <typename FImpl>
 class TA2AMesonField : public Module<A2AMesonFieldPar>
 {
 public:
     FERM_TYPE_ALIASES(FImpl,);
     using Computation = A2AMatrixBlockComputation<Complex,
                                                   FermionField,
                                                   A2AMesonFieldMetadata,
                                                   HADRONS_A2AM_IO_TYPE>;
     using Kernel      = MesonFieldKernel<Complex, FImpl>;
 public:
     // constructor
     TA2AMesonField(const std::string name);
     // destructor
     virtual ~TA2AMesonField(void) = default;
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
     // setup
     virtual void setup(void);
     // execution
     virtual void execute(void);
 private:
     // true once the momentum phase fields have been filled by execute()
     bool                               hasPhase_ = false;
     // environment name of the cached momentum phase fields
     std::string                        momphName_;
     // spin bilinears and momenta parsed from the parameters in setup()
     std::vector<Gamma::Algebra>        gamma_;
     std::vector<std::vector<Real>>     mom_;
 };
 // Module registration for the FIMPL instantiation under the name
 // A2AMesonField in MContraction (macro body not visible here - presumably
 // it hooks the type into the module factory; TODO confirm).
 MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
 /******************************************************************************
 *                  TA2AMesonField implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
 TA2AMesonField<FImpl>::TA2AMesonField(const std::string name)
 : Module<A2AMesonFieldPar>(name)
 , momphName_(name + "_momph")
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // The module depends on the two fermion field sets named in the parameters.
 template <typename FImpl>
 std::vector<std::string> TA2AMesonField<FImpl>::getInput(void)
 {
     return {par().left, par().right};
 }
 // The module declares no output objects (the returned list is empty).
 template <typename FImpl>
 std::vector<std::string> TA2AMesonField<FImpl>::getOutput(void)
 {
     return std::vector<std::string>();
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Parses the bilinear and momentum parameters, then declares the cached
 // phase fields, the "coor" temporary and the block computation object.
 template <typename FImpl>
 void TA2AMesonField<FImpl>::setup(void)
 {
     // number of components every momentum must have
     const auto nComp = env().getNd() - 1;
     // spin bilinears: explicit list unless "all" is requested
     gamma_.clear();
     if (par().gammas != "all")
     {
         gamma_ = strToVec<Gamma::Algebra>(par().gammas);
     }
     else
     {
         gamma_ = {
             Gamma::Algebra::Gamma5,
             Gamma::Algebra::Identity,
             Gamma::Algebra::GammaX,
             Gamma::Algebra::GammaY,
             Gamma::Algebra::GammaZ,
             Gamma::Algebra::GammaT,
             Gamma::Algebra::GammaXGamma5,
             Gamma::Algebra::GammaYGamma5,
             Gamma::Algebra::GammaZGamma5,
             Gamma::Algebra::GammaTGamma5,
             Gamma::Algebra::SigmaXY,
             Gamma::Algebra::SigmaXZ,
             Gamma::Algebra::SigmaXT,
             Gamma::Algebra::SigmaYZ,
             Gamma::Algebra::SigmaYT,
             Gamma::Algebra::SigmaZT
         };
     }
     // momenta: one parsed vector per parameter string
     mom_.clear();
     for (const auto &momStr: par().mom)
     {
         const auto components = strToVec<Real>(momStr);
         if (components.size() != nComp)
         {
             HADRONS_ERROR(Size, "Momentum has " + std::to_string(components.size())
                                 + " components instead of " 
                                 + std::to_string(nComp));
         }
         mom_.push_back(components);
     }
     // environment objects used by execute()
     envCache(std::vector<ComplexField>, momphName_, 1, 
              par().mom.size(), envGetGrid(ComplexField));
     envTmpLat(ComplexField, "coor");
     envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
            env().getNd() - 1, mom_.size(), gamma_.size(), par().block, 
            par().cacheBlock, this);
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Computes the meson fields for every (momentum, bilinear) pair:
 //   1. logs the configuration and the expected file size,
 //   2. fills the cached momentum phase fields on the first call only,
 //   3. runs the block computation with lambdas providing the dataset name,
 //      the file name and the metadata of each (momentum, bilinear) pair.
 template <typename FImpl>
 void TA2AMesonField<FImpl>::execute(void)
 {
     auto &left  = envGet(std::vector<FermionField>, par().left);
     auto &right = envGet(std::vector<FermionField>, par().right);
     // sizes are held as size_t so that the file size estimate below is
     // evaluated in size_t and cannot overflow an int for large mode numbers
     const size_t       nt   = env().getDim().back();
     const size_t       N_i  = left.size();
     const size_t       N_j  = right.size();
     const unsigned int nmom = mom_.size();
     LOG(Message) << "Computing all-to-all meson fields" << std::endl;
     LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
     LOG(Message) << "Momenta:" << std::endl;
     for (auto &p: mom_)
     {
         LOG(Message) << "  " << p << std::endl;
     }
     LOG(Message) << "Spin bilinears:" << std::endl;
     for (auto &g: gamma_)
     {
         LOG(Message) << "  " << g << std::endl;
     }
     LOG(Message) << "Meson field size: " << nt << "*" << N_i << "*" << N_j 
                  << " (filesize " << sizeString(nt*N_i*N_j*sizeof(HADRONS_A2AM_IO_TYPE)) 
                  << "/momentum/bilinear)" << std::endl;
     auto &ph = envGet(std::vector<ComplexField>, momphName_);
     // the phase fields are cached in the environment: compute them only once
     if (!hasPhase_)
     {
         startTimer("Momentum phases");
         // coordinate scratch field, looked up once for all momenta
         envGetTmp(ComplexField, coor);
         Complex i(0.0,1.0);
         for (unsigned int j = 0; j < nmom; ++j)
         {
             // ph[j] = exp(2*pi*i*sum_mu p_mu*x_mu/L_mu)
             ph[j] = zero;
             for(unsigned int mu = 0; mu < mom_[j].size(); mu++)
             {
                 LatticeCoordinate(coor, mu);
                 ph[j] = ph[j] + (mom_[j][mu]/env().getDim(mu))*coor;
             }
             ph[j] = exp((Real)(2*M_PI)*i*ph[j]);
         }
         hasPhase_ = true;
         stopTimer("Momentum phases");
     }
     // dataset name: "<gamma>_<p0>_<p1>_..."
     auto ionameFn = [this](const unsigned int m, const unsigned int g)
     {
         std::stringstream ss;
         ss << gamma_[g] << "_";
         for (unsigned int mu = 0; mu < mom_[m].size(); ++mu)
         {
             ss << mom_[m][mu] << ((mu == mom_[m].size() - 1) ? "" : "_");
         }
         return ss.str();
     };
     // file name: "<output>.<trajectory>/<dataset name>.h5"
     auto filenameFn = [this, &ionameFn](const unsigned int m, const unsigned int g)
     {
         return par().output + "." + std::to_string(vm().getTrajectory()) 
                + "/" + ionameFn(m, g) + ".h5";
     };
     // metadata: momentum components and bilinear of the pair
     auto metadataFn = [this](const unsigned int m, const unsigned int g)
     {
         A2AMesonFieldMetadata md;
         for (auto pmu: mom_[m])
         {
             md.momentum.push_back(pmu);
         }
         md.gamma = gamma_[g];
         return md;
     };
     Kernel      kernel(gamma_, ph, envGetGrid(FermionField));
     envGetTmp(Computation, computation);
     computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_A2AMesonField_hpp_
--- a/Hadrons/Modules/MContraction/Baryon.cc
+++ b/Hadrons/Modules/MContraction/Baryon.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/Baryon.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/Baryon.hpp>
 // Bring the Grid, Hadrons and MContraction namespaces into scope for this
 // translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of the TBaryon module template with FIMPL for all
 // three template arguments.
 template class Grid::Hadrons::MContraction::TBaryon<FIMPL,FIMPL,FIMPL>;
--- a/Hadrons/Modules/MContraction/DiscLoop.cc
+++ b/Hadrons/Modules/MContraction/DiscLoop.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/DiscLoop.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/DiscLoop.hpp>
 // Bring the Grid, Hadrons and MContraction namespaces into scope for this
 // translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of the TDiscLoop module template for FIMPL.
 template class Grid::Hadrons::MContraction::TDiscLoop<FIMPL>;
--- a/Hadrons/Modules/MContraction/Gamma3pt.cc
+++ b/Hadrons/Modules/MContraction/Gamma3pt.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/Gamma3pt.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
 // Bring the Grid, Hadrons and MContraction namespaces into scope for this
 // translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of the TGamma3pt module template with FIMPL for all
 // three template arguments.
 template class Grid::Hadrons::MContraction::TGamma3pt<FIMPL,FIMPL,FIMPL>;
--- a/Hadrons/Modules/MContraction/Meson.cc
+++ b/Hadrons/Modules/MContraction/Meson.cc
@@ -1,35 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/Meson.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/Meson.hpp>
 // Bring the Grid, Hadrons and MContraction namespaces into scope for this
 // translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of the TMeson module template with FIMPL for both
 // template arguments.
 template class Grid::Hadrons::MContraction::TMeson<FIMPL,FIMPL>;
--- a/Hadrons/Modules/MContraction/WeakEye3pt.cc
+++ b/Hadrons/Modules/MContraction/WeakEye3pt.cc
@@ -1,34 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/WeakEye3pt.cc
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/WeakEye3pt.hpp>
 // Bring the Grid, Hadrons and MContraction namespaces into scope for this
 // translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of the TWeakEye3pt module template for FIMPL.
 template class Grid::Hadrons::MContraction::TWeakEye3pt<FIMPL>;
--- a/Hadrons/Modules/MContraction/WeakEye3pt.hpp
+++ b/Hadrons/Modules/MContraction/WeakEye3pt.hpp
@@ -1,200 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/WeakEye3pt.hpp
 Copyright (C) 2015-2019
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Lanny91 <andrew.lawson@gmail.com>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_WeakEye3pt_hpp_
 #define Hadrons_MContraction_WeakEye3pt_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /*
 * Weak Hamiltonian meson 3-pt diagrams, eye topologies.
 * 
 * Schematics:       loop                 |                  
 *                  /-<-¬                 |                             
 *                 /     \                |            qbl     G     qbr
 *                 \     /                |        /----<------*------<----¬         
 *            qbl   \   /    qbr          |       /          /-*-¬          \
 *       /-----<-----* *-----<----¬       |      /          /  G  \          \
 *  gIn *            G G           * gOut | gIn *           \     /  loop    * gOut
 *       \                        /       |      \           \->-/          /   
 *        \                      /        |       \                        /       
 *         \---------->---------/         |        \----------->----------/        
 *                   qs                   |                   qs                  
 *                                        |
 *                one trace               |                two traces
 * 
 * one trace : tr(qbr*gOut*qs*adj(gIn)*g5*adj(qbl)*g5*G*loop*G)
 * two traces: tr(qbr*gOut*qs*adj(gIn)*g5*adj(qbl)*g5*G)*tr(loop*G)
 * 
 */
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Serialisable parameter set of the WeakEye3pt module.
 //   qBarLeft, qBarRight, loop: names of PropagatorField inputs
 //   qSpectator               : name of a SlicedPropagator input
 //   tOut                     : index used to pick one slice of qSpectator
 //   gammaIn, gammaOut        : gamma structures gIn/gOut of the diagrams
 //   output                   : name handed to saveResult()
 class WeakEye3ptPar: Serializable
 {
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(WeakEye3ptPar,
                                     std::string,    qBarLeft,
                                     std::string,    qBarRight,
                                     std::string,    qSpectator,
                                     std::string,    loop,
                                     unsigned int,   tOut,
                                     Gamma::Algebra, gammaIn,
                                     Gamma::Algebra, gammaOut,
                                     std::string,    output);
 };
 // Hadrons module computing the eye-topology weak 3-pt contractions sketched
 // in the schematic above, in their one-trace and two-trace variants.
 template <typename FImpl>
 class TWeakEye3pt: public Module<WeakEye3ptPar>
 {
 public:
     FERM_TYPE_ALIASES(FImpl,);
     // Metadata stored with every correlator: in/out are the gammaIn/gammaOut
     // parameters, op is the gamma matrix G of the current loop iteration and
     // trace is 1 or 2 depending on the contraction variant.
     class Metadata: Serializable
     {
     public:
         GRID_SERIALIZABLE_CLASS_MEMBERS(Metadata,
                                         Gamma::Algebra, in,
                                         Gamma::Algebra, out,
                                         Gamma::Algebra, op,
                                         unsigned int,   trace);
     };
     // one correlator together with its metadata
     typedef Correlator<Metadata> Result;
 public:
     // constructor
     TWeakEye3pt(const std::string name);
     // destructor
     virtual ~TWeakEye3pt(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
     // setup
     virtual void setup(void);
     // execution
     virtual void execute(void);
 };
 // Module registration for the FIMPL instantiation under the name WeakEye3pt
 // in MContraction (macro body not visible here - presumably it hooks the
 // type into the module factory; TODO confirm).
 MODULE_REGISTER_TMP(WeakEye3pt, TWeakEye3pt<FIMPL>, MContraction);
 /******************************************************************************
 *                        TWeakEye3pt implementation                          *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<WeakEye3ptPar> base; the class has
 // no further state to initialise.
 template <typename FImpl>
 TWeakEye3pt<FImpl>::TWeakEye3pt(const std::string name)
 : Module<WeakEye3ptPar>(name)
 {}
 // dependencies/products ///////////////////////////////////////////////////////
 // The module depends on the three quark propagators and the loop propagator
 // named in the parameters.
 template <typename FImpl>
 std::vector<std::string> TWeakEye3pt<FImpl>::getInput(void)
 {
     return {par().qBarLeft, par().qBarRight, par().qSpectator, par().loop};
 }
 // The module declares no output objects (the returned list is empty).
 template <typename FImpl>
 std::vector<std::string> TWeakEye3pt<FImpl>::getOutput(void)
 {
     return std::vector<std::string>();
 }
 // setup ///////////////////////////////////////////////////////////////////////
 // Declares the temporary ComplexField "corr" that execute() retrieves with
 // envGetTmp and uses as scratch space for the traced contractions.
 template <typename FImpl>
 void TWeakEye3pt<FImpl>::setup(void)
 {
     envTmpLat(ComplexField, "corr");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Computes, for every gamma matrix G in Gamma::gall, the one-trace and
 // two-trace eye contractions, slice-sums them along Tp and saves all
 // correlators with their metadata under the "weakEye3pt" name.
 // Raises a Size error when tOut does not index a slice of the spectator.
 template <typename FImpl>
 void TWeakEye3pt<FImpl>::execute(void)
 {
     LOG(Message) << "Computing mesonic weak 3pt contractions, eye topologies" << std::endl;
     LOG(Message) << "gIn : " << par().gammaIn << std::endl;
     LOG(Message) << "gOut: " << par().gammaOut << std::endl;
     LOG(Message) << "tOut: " << par().tOut << std::endl;
     LOG(Message) << "qbl : " << par().qBarLeft << std::endl;
     LOG(Message) << "qbr : " << par().qBarRight << std::endl;
     LOG(Message) << "qs  : " << par().qSpectator << std::endl;
     LOG(Message) << "loop: " << par().loop << std::endl;
     std::vector<Result> result;
     Result              r;
     auto                &qbl  = envGet(PropagatorField, par().qBarLeft);
     auto                &qbr  = envGet(PropagatorField, par().qBarRight);
     auto                &loop = envGet(PropagatorField, par().loop);
     auto                &qs   = envGet(SlicedPropagator, par().qSpectator);
     // qs is indexed with tOut right below: reject out-of-range values
     // instead of reading past the end of the sliced propagator
     if (par().tOut >= qs.size())
     {
         HADRONS_ERROR(Size, "tOut (" + std::to_string(par().tOut)
                             + ") is out of range for the sliced spectator propagator (size "
                             + std::to_string(qs.size()) + ")");
     }
     auto                qst   = qs[par().tOut];
     Gamma               gIn(par().gammaIn), gOut(par().gammaOut);
     Gamma               g5(Gamma::Algebra::Gamma5);
     envGetTmp(ComplexField, corr);
     // slice-sums corr along Tp and appends it to result with the trace count
     auto store = [&](const unsigned int nTrace)
     {
         SlicedComplex buf;
         sliceSum(corr, buf, Tp);
         r.corr.clear();
         for (unsigned int t = 0; t < buf.size(); ++t)
         {
             r.corr.push_back(TensorRemove(buf[t]));
         }
         r.info.trace = nTrace;
         result.push_back(r);
     };
     r.info.in  = par().gammaIn;
     r.info.out = par().gammaOut;
     for (auto &G: Gamma::gall)
     {
         r.info.op = G.g;
         // one trace
         corr = trace(qbr*gOut*qst*adj(gIn)*g5*adj(qbl)*g5*G*loop*G);
         store(1);
         // two traces
         corr = trace(qbr*gOut*qst*adj(gIn)*g5*adj(qbl)*g5*G)*trace(loop*G);
         store(2);
     }
     saveResult(par().output, "weakEye3pt", result);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_WeakEye3pt_hpp_
--- a/Hadrons/Modules/MContraction/WeakMesonDecayKl2.cc
+++ b/Hadrons/Modules/MContraction/WeakMesonDecayKl2.cc
@@ -1,36 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/WeakMesonDecayKl2.cc
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp>
 // Bring the Grid, Hadrons and MContraction namespaces into scope for this
 // translation unit.
 using namespace Grid;
 using namespace Hadrons;
 using namespace MContraction;
 // Explicit instantiation of the TWeakMesonDecayKl2 module template for FIMPL.
 template class Grid::Hadrons::MContraction::TWeakMesonDecayKl2<FIMPL>;
--- a/Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp
+++ b/Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp
@@ -1,185 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp
 Copyright (C) 2015-2018
 Author: Antonin Portelli <antonin.portelli@me.com>
 Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef Hadrons_MContraction_WeakMesonDecayKl2_hpp_
 #define Hadrons_MContraction_WeakMesonDecayKl2_hpp_
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
 BEGIN_HADRONS_NAMESPACE
 /*
 * Kl2 contraction
 * -----------------------------
 *
 * contraction for Kl2 decay, including the lepton
 *
 * 	trace(q1*adj(q2)*g5*gL[mu]) * (gL[mu] * lepton)_{a,b}
 *
 * with open spinor indices (a,b) for the lepton part
 *
 *             q1                  lepton
 *        /------------\       /------------
 *       /              \     /
 *      /                \H_W/
 * g_5 *                  * * 
 *      \                /
 *       \              / 
 *        \____________/
 *             q2
 *
 * options:
 * - q1: input propagator 1 (string)
 * - q2: input propagator 2 (string)
 * - lepton: input lepton (string)
 */
 /******************************************************************************
 *                               TWeakMesonDecayKl2                           *
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 // Parameters of the WeakMesonDecayKl2 module (all strings):
 //  - q1, q2 : names of the two input quark propagators
 //  - lepton : name of the input lepton propagator
 //  - output : passed as first argument to saveResult() in execute()
 class WeakMesonDecayKl2Par: Serializable
 {
 public:
    // the macro declares the members below and their serialisation; the
    // member order is part of the macro invocation and must be preserved
    GRID_SERIALIZABLE_CLASS_MEMBERS(WeakMesonDecayKl2Par,
                                    std::string, q1,
                                    std::string, q2,
                                    std::string, lepton,
 				                    std::string, output);
 };
 // Module performing the Kl2 contraction described in the comment block at
 // the top of this header; FImpl is the fermion implementation the field
 // type aliases are derived from.
 template <typename FImpl>
 class TWeakMesonDecayKl2: public Module<WeakMesonDecayKl2Par>
 {
 public:
    // field type aliases for FImpl (PropagatorField, ComplexField,
    // SpinMatrixField are used in setup()/execute())
    FERM_TYPE_ALIASES(FImpl,);
    // scalar_object type associated with SpinMatrixField
    typedef typename SpinMatrixField::vector_object::scalar_object SpinMatrix;
    // serialisable result: corr is filled by sliceSum() in execute()
    class Result: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        std::vector<SpinMatrix>, corr);
    };
 public:
    // constructor
    TWeakMesonDecayKl2(const std::string name);
    // destructor
    virtual ~TWeakMesonDecayKl2(void) {};
    // dependencies/products
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
 };
 // register the TWeakMesonDecayKl2<FIMPL> instantiation as WeakMesonDecayKl2
 // in MContraction (see MODULE_REGISTER_TMP for the exact semantics)
 MODULE_REGISTER_TMP(WeakMesonDecayKl2, TWeakMesonDecayKl2<FIMPL>, MContraction);
 /******************************************************************************
 *                           TWeakMesonDecayKl2 implementation                   *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 // Forwards the module name to the Module<WeakMesonDecayKl2Par> base; the
 // class has no other state to initialise.
 template <typename FImpl>
 TWeakMesonDecayKl2<FImpl>::TWeakMesonDecayKl2(const std::string name)
    : Module<WeakMesonDecayKl2Par>(name)
 {
 }
 // dependencies/products ///////////////////////////////////////////////////////
 // Names of the objects this module reads: the two quark propagators and
 // the lepton, in that order.
 template <typename FImpl>
 std::vector<std::string> TWeakMesonDecayKl2<FImpl>::getInput(void)
 {
    return {par().q1, par().q2, par().lepton};
 }
 // Names of the objects this module declares as products: none.
 // NOTE(review): setup() creates a lattice named getName() that is not
 // listed here -- confirm this omission is intended.
 template <typename FImpl>
 std::vector<std::string> TWeakMesonDecayKl2<FImpl>::getOutput(void)
 {
    return std::vector<std::string>();
 }
 // setup ////////////////////////////////////////////////////////////////////////
 // Declares the lattice objects execute() works with: "c", "prop_buf" and
 // "buf" via envTmpLat, plus a PropagatorField named after the module via
 // envCreateLat.
 template <typename FImpl>
 void TWeakMesonDecayKl2<FImpl>::setup(void)
 {
    // hadronic trace, one value per site
    envTmpLat(ComplexField, "c");
    // set to 1. in execute() and multiplied with c
    envTmpLat(PropagatorField, "prop_buf");
    // accumulator for the sum over mu in execute()
    envCreateLat(PropagatorField, getName());
    // colour (0,0) component of the accumulator, input to sliceSum()
    envTmpLat(SpinMatrixField, "buf");
 }
 // execution ///////////////////////////////////////////////////////////////////
 // Computes the Kl2 contraction and saves it.
 //
 // For mu = 0..3 the hadronic factor trace(q1*adj(q2)*g5*gL[mu]) is
 // multiplied with gL[mu]*lepton and accumulated into the lattice named
 // getName(). The colour (0,0) component of the sum is then reduced with
 // sliceSum() along Tp into Result::corr, and the result is written with
 // saveResult() under the name "weakdecay".
 template <typename FImpl>
 void TWeakMesonDecayKl2<FImpl>::execute(void)
 {
    LOG(Message) << "Computing QED Kl2 contractions '" << getName() << "' using"
                 << " quarks '" << par().q1 << "' and '" << par().q2 << "' and"
                 << " lepton '" << par().lepton << "'" << std::endl;

    Gamma  g5(Gamma::Algebra::Gamma5);
    Result r;

    auto &res    = envGet(PropagatorField, getName());
    auto &q1     = envGet(PropagatorField, par().q1);
    auto &q2     = envGet(PropagatorField, par().q2);
    auto &lepton = envGet(PropagatorField, par().lepton);
    envGetTmp(SpinMatrixField, buf);
    envGetTmp(ComplexField, c);
    envGetTmp(PropagatorField, prop_buf);

    // start the accumulation over mu from zero
    res = zero;
    // loop-invariant: prop_buf is 1. everywhere and is only read below;
    // presumably it promotes the complex field c to a propagator-valued
    // field before the gamma/lepton multiplication -- TODO confirm
    prop_buf = 1.;
    for (unsigned int mu = 0; mu < 4; ++mu)
    {
        // hadronic part: trace(q1*adj(q2)*g5*gL[mu])
        c = trace(q1*adj(q2)*g5*GammaL(Gamma::gmu[mu]));
        // multiply lepton part
        res += c * prop_buf * GammaL(Gamma::gmu[mu]) * lepton;
    }
    // keep the colour (0,0) component, leaving open spin indices
    buf = peekColour(res, 0, 0);
    sliceSum(buf, r.corr, Tp);
    saveResult(par().output, "weakdecay", r);
 }
 END_MODULE_NAMESPACE
 END_HADRONS_NAMESPACE
 #endif // Hadrons_MContraction_WeakMesonDecayKl2_hpp_
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Guido Cossu	f4e6824f22	Minor changes	2017-10-09 09:44:03 +01:00
Guido Cossu	ac5cfd33a6	Fixing a compilation error	2017-10-04 14:29:01 +01:00
Guido Cossu	f605230bbb	Added laplacian operator for smearing sources	2017-10-04 13:54:54 +01:00
		`@@ -1,3 +0,0 @@`
			`#include <Grid/GridCore.h>`

			`int Grid::BinaryIO::latticeWriteMaxRetry = -1;`