diff --git a/.gitignore b/.gitignore
index 399f2f6b..45a3ea53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -83,6 +83,7 @@ ltmain.sh
 .Trashes
 ehthumbs.db
 Thumbs.db
+.dirstamp
 
 # build directory #
 ###################
@@ -97,11 +98,8 @@ build.sh
 
 # Eigen source #
 ################
-lib/Eigen/*
-
-# FFTW source #
-################
-lib/fftw/*
+Grid/Eigen
+Eigen/*
 
 # libtool macros #
 ##################
@@ -112,14 +110,7 @@ m4/libtool.m4
 ################
 gh-pages/
 
-# Buck files #
-##############
-.buck*
-buck-out
-BUCK
-make-bin-BUCK.sh
-
 # generated sources #
 #####################
-lib/qcd/spin/gamma-gen/*.h
-lib/qcd/spin/gamma-gen/*.cc
+Grid/qcd/spin/gamma-gen/*.h
+Grid/qcd/spin/gamma-gen/*.cc
diff --git a/.travis.yml b/.travis.yml
index 7d8203ce..129fd582 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,6 +9,11 @@ matrix:
     - os:        osx
       osx_image: xcode8.3
       compiler: clang
+      env: PREC=single
+    - os:        osx
+      osx_image: xcode8.3
+      compiler: clang
+      env: PREC=double
       
 before_install:
     - export GRIDDIR=`pwd`
@@ -16,9 +21,11 @@ before_install:
     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
     
 install:
+    - export CWD=`pwd`
+    - echo $CWD
     - export CC=$CC$VERSION
     - export CXX=$CXX$VERSION
     - echo $PATH
@@ -31,16 +38,24 @@ install:
     - which $CXX
     - $CXX --version
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
     
 script:
     - ./bootstrap.sh
     - mkdir build
     - cd build
-    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
+    - mkdir lime
+    - cd lime
+    - mkdir build
+    - cd build
+    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
+    - tar xf lime-1.3.2.tar.gz
+    - cd lime-1.3.2
+    - ./configure --prefix=$CWD/build/lime/install
+    - make -j4
+    - make install
+    - cd $CWD/build
+    - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
     - make -j4 
     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
-    - echo make clean
-    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
-    - make -j4
-    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
     - make check
diff --git a/lib/DisableWarnings.h b/Grid/DisableWarnings.h
similarity index 100%
rename from lib/DisableWarnings.h
rename to Grid/DisableWarnings.h
diff --git a/lib/Grid.h b/Grid/Grid.h
similarity index 96%
rename from lib/Grid.h
rename to Grid/Grid.h
index 475c00b6..0fdd5268 100644
--- a/lib/Grid.h
+++ b/Grid/Grid.h
@@ -42,7 +42,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/Action.h>
 #include <Grid/qcd/utils/GaugeFix.h>
+NAMESPACE_CHECK(GaugeFix);
 #include <Grid/qcd/smearing/Smearing.h>
+NAMESPACE_CHECK(Smearing);
 #include <Grid/parallelIO/MetaData.h>
 #include <Grid/qcd/hmc/HMC_aggregate.h>
 
diff --git a/lib/GridCore.h b/Grid/GridCore.h
similarity index 97%
rename from lib/GridCore.h
rename to Grid/GridCore.h
index cc1811af..ba0499f6 100644
--- a/lib/GridCore.h
+++ b/Grid/GridCore.h
@@ -38,18 +38,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_BASE_H
 #define GRID_BASE_H
 
+
+
 #include <Grid/DisableWarnings.h>
 #include <Grid/Namespace.h>
 #include <Grid/GridStd.h>
 #include <Grid/threads/Pragmas.h>
 #include <Grid/perfmon/Timer.h>
 #include <Grid/perfmon/PerfCount.h>
+#include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
 #include <Grid/allocator/AlignedAllocator.h>
 #include <Grid/simd/Simd.h>
 #include <Grid/threads/Threads.h>
 #include <Grid/serialisation/Serialisation.h>
-#include <Grid/util/Util.h>
+#include <Grid/util/Sha.h>
 #include <Grid/communicator/Communicator.h> 
 #include <Grid/cartesian/Cartesian.h>    
 #include <Grid/tensors/Tensors.h>      
@@ -58,5 +61,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/stencil/Stencil.h>      
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/algorithms/Algorithms.h>   
+NAMESPACE_CHECK(GridCore)
 
 #endif
diff --git a/lib/GridQCDcore.h b/Grid/GridQCDcore.h
similarity index 98%
rename from lib/GridQCDcore.h
rename to Grid/GridQCDcore.h
index 7f50761f..cae6f43f 100644
--- a/lib/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@@ -38,5 +38,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
+NAMESPACE_CHECK(GridQCDCore);
 
 #endif
diff --git a/lib/GridStd.h b/Grid/GridStd.h
similarity index 100%
rename from lib/GridStd.h
rename to Grid/GridStd.h
diff --git a/lib/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h
similarity index 84%
rename from lib/Grid_Eigen_Dense.h
rename to Grid/Grid_Eigen_Dense.h
index c57148ba..cbe0a389 100644
--- a/lib/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -1,5 +1,10 @@
 #include <Grid/GridCore.h>
 #pragma once
+// Force Eigen to use MKL if Grid has been configured with --enable-mkl
+#ifdef USE_MKL
+#define EIGEN_USE_MKL_ALL
+#endif
+
 #if defined __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
diff --git a/lib/Makefile.am b/Grid/Makefile.am
similarity index 51%
rename from lib/Makefile.am
rename to Grid/Makefile.am
index dc33e7cf..b88ea4f2 100644
--- a/lib/Makefile.am
+++ b/Grid/Makefile.am
@@ -21,6 +21,32 @@ if BUILD_HDF5
   extra_headers+=serialisation/Hdf5Type.h
 endif
 
+all: version-cache
+# Regenerate the git stamp on every run; Version.h is deleted only when the stamp text changes.
+version-cache:
+	@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
+		a="uncommited changes";\
+	else\
+		a="clean";\
+	fi;\
+	echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d $$a\\"%n" HEAD`" > vertmp;\
+	if [ -e version-cache ]; then\
+		d=`diff vertmp version-cache`;\
+		if [ "$${d}" != "" ]; then\
+			mv vertmp version-cache;\
+			rm -f Version.h;\
+		fi;\
+	else\
+		mv vertmp version-cache;\
+		rm -f Version.h;\
+	fi;\
+	rm -f vertmp
+# Order-only prerequisite (GNU make): the stamp recipe runs first, without forcing a recopy each build.
+Version.h: | version-cache
+	cp version-cache Version.h
+# Phony: the recipe must run even though a file named version-cache exists afterwards.
+.PHONY: version-cache
+
 #
 # Libraries
 #
@@ -30,8 +56,8 @@ include Eigen.inc
 lib_LIBRARIES = libGrid.a
 
 CCFILES += $(extra_sources)
-HFILES  += $(extra_headers)
+HFILES  += $(extra_headers) Config.h Version.h
 
 libGrid_a_SOURCES              = $(CCFILES)
-libGrid_adir                   = $(pkgincludedir)
-nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
+libGrid_adir                   = $(includedir)/Grid
+nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) $(eigen_unsupp_files)
diff --git a/lib/Namespace.h b/Grid/Namespace.h
similarity index 87%
rename from lib/Namespace.h
rename to Grid/Namespace.h
index e405bc30..29b229fa 100644
--- a/lib/Namespace.h
+++ b/Grid/Namespace.h
@@ -28,7 +28,11 @@ directory
 /*  END LEGAL */
 #pragma once
 
+#include <type_traits>
+#include <cassert>
+
 #define NAMESPACE_BEGIN(A) namespace A {
 #define NAMESPACE_END(A)   }
 #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
 #define GRID_NAMESPACE_END   NAMESPACE_END(Grid)
+#define NAMESPACE_CHECK(x) struct namespaceTEST##x {};  static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at"  ); 
diff --git a/lib/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h
similarity index 97%
rename from lib/algorithms/Algorithms.h
rename to Grid/algorithms/Algorithms.h
index 070a1019..ef147c53 100644
--- a/lib/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/approx/Forecast.h>
 
+#include <Grid/algorithms/iterative/Deflation.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
diff --git a/lib/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
similarity index 100%
rename from lib/algorithms/CoarsenedMatrix.h
rename to Grid/algorithms/CoarsenedMatrix.h
diff --git a/lib/algorithms/FFT.h b/Grid/algorithms/FFT.h
similarity index 100%
rename from lib/algorithms/FFT.h
rename to Grid/algorithms/FFT.h
diff --git a/lib/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h
similarity index 88%
rename from lib/algorithms/LinearOperator.h
rename to Grid/algorithms/LinearOperator.h
index 8b2ecf57..ced8d987 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -182,12 +182,15 @@ class SchurOperatorBase :  public LinearOperatorBase<Field> {
 public:
   virtual  RealD Mpc      (const Field &in, Field &out) =0;
   virtual  RealD MpcDag   (const Field &in, Field &out) =0;
-  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+  virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) 
+  {
     Field tmp(in.Grid());
+    tmp.Checkerboard() = in.Checkerboard();
     ni=Mpc(in,tmp);
     no=MpcDag(tmp,out);
   }
   virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    out.Checkerboard() = in.Checkerboard();
     MpcDagMpc(in,out,n1,n2);
   }
   virtual void HermOp(const Field &in, Field &out){
@@ -217,11 +220,13 @@ public:
   virtual  RealD Mpc      (const Field &in, Field &out) {
     Field tmp(in.Grid());
     //	std::cout <<"grid pointers: in.Grid()="<< in.Grid() << " out.Grid()=" << out.Grid() << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
+    tmp.Checkerboard() = !in.Checkerboard();
 
     _Mat.Meooe(in,tmp);
     _Mat.MooeeInv(tmp,out);
     _Mat.Meooe(out,tmp);
 
+      //std::cout << "cb in " << in.Checkerboard() << "  cb out " << out.Checkerboard() << std::endl;
     _Mat.Mooee(in,out);
     return axpy_norm(out,-1.0,tmp,out);
   }
@@ -305,36 +310,69 @@ template<class Matrix,class Field>
 class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
 protected:
   Matrix &_Mat;
+      Field tmp;
+      RealD mass;
+      double tMpc;
+      double tIP;
+      double tMeo;
+      double taxpby_norm;
+      uint64_t ncall;
 public:
-  SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
+      void Report(void) { // prints each accumulated timer divided by the number of HermOp/HermOpAndNorm calls
+	const double calls = ncall ? (double)ncall : 1.0; // ncall==0 before the first call: avoid printing NaN/inf
+	std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/calls<<" usec "<<std::endl;
+	std::cout << GridLogMessage << " HermOpAndNorm.IP  "<< tIP /calls<<" usec "<<std::endl;
+	std::cout << GridLogMessage << " Mpc.MeoMoe        "<< tMeo/calls<<" usec "<<std::endl;
+	std::cout << GridLogMessage << " Mpc.axpby_norm    "<< taxpby_norm/calls<<" usec "<<std::endl;
+      }
+      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) 
+      { 
+	assert( _Mat.isTrivialEE() );
+	mass = _Mat.Mass();
+	tMpc=0;
+	tIP =0;
+        tMeo=0;
+        taxpby_norm=0;
+	ncall=0;
+      }
   virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    GridLogIterative.TimingMode(1);
-    std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
+	ncall++;
+	tMpc-=usecond();
     n2 = Mpc(in,out);
-    std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
+	tMpc+=usecond();
+	tIP-=usecond();
     ComplexD dot= innerProduct(in,out);
-    std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
+	tIP+=usecond();
     n1 = real(dot);
   }
   virtual void HermOp(const Field &in, Field &out){
-    std::cout << GridLogIterative << " HermOp "<<std::endl;
-    Mpc(in,out);
+	ncall++;
+	tMpc-=usecond();
+	_Mat.Meooe(in,out);
+	_Mat.Meooe(out,tmp);
+	tMpc+=usecond();
+	taxpby_norm-=usecond();
+	axpby(out,-1.0,mass*mass,tmp,in);
+	taxpby_norm+=usecond();
   }
-  virtual  RealD Mpc      (const Field &in, Field &out) {
+  virtual  RealD Mpc      (const Field &in, Field &out) 
+  {
+
     Field tmp(in.Grid());
     Field tmp2(in.Grid());
 
-    std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
+    //    std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
     _Mat.Mooee(in,out);
     _Mat.Mooee(out,tmp);
-    std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
+    //    std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
 
+    tMeo-=usecond();
     _Mat.Meooe(in,out);
-    _Mat.Meooe(out,tmp2);
-    std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;
-
-    RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
-    std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
+    _Mat.Meooe(out,tmp);
+    tMeo+=usecond();
+    taxpby_norm-=usecond();
+    RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
+    taxpby_norm+=usecond();
     return nn;
   }
   virtual  RealD MpcDag   (const Field &in, Field &out){
@@ -353,6 +391,12 @@ template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOpera
 template<class Field> class OperatorFunction {
 public:
   virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
+      // Default multi-RHS entry point: apply the single-field operator() to each in[k]/out[k] pair in turn.
+      virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
+	assert(in.size()==out.size());
+	typedef typename std::vector<Field>::size_type index_t; // unsigned index, same type as in.size()
+	for(index_t k=0;k<in.size();k++) (*this)(Linop,in[k],out[k]);
+      }
 };
 
 template<class Field> class LinearFunction {
diff --git a/lib/algorithms/Preconditioner.h b/Grid/algorithms/Preconditioner.h
similarity index 100%
rename from lib/algorithms/Preconditioner.h
rename to Grid/algorithms/Preconditioner.h
diff --git a/lib/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h
similarity index 86%
rename from lib/algorithms/SparseMatrix.h
rename to Grid/algorithms/SparseMatrix.h
index 63b30e95..c1473e56 100644
--- a/lib/algorithms/SparseMatrix.h
+++ b/Grid/algorithms/SparseMatrix.h
@@ -55,6 +55,14 @@ public:
 template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
 public:
   virtual GridBase *RedBlackGrid(void)=0;
+
+      //////////////////////////////////////////////////////////////////////
+      // Query the even even properties to make algorithmic decisions
+      //////////////////////////////////////////////////////////////////////
+      virtual RealD  Mass(void)        { return 0.0; };
+      virtual int    ConstEE(void)     { return 0; }; // Disable assumptions unless overridden
+      virtual int    isTrivialEE(void) { return 0; }; // by a derived class that knows better
+
   // half checkerboard operaions
   virtual  void Meooe    (const Field &in, Field &out)=0;
   virtual  void Mooee    (const Field &in, Field &out)=0;
diff --git a/lib/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h
similarity index 100%
rename from lib/algorithms/approx/Chebyshev.h
rename to Grid/algorithms/approx/Chebyshev.h
diff --git a/lib/algorithms/approx/Forecast.h b/Grid/algorithms/approx/Forecast.h
similarity index 100%
rename from lib/algorithms/approx/Forecast.h
rename to Grid/algorithms/approx/Forecast.h
diff --git a/lib/algorithms/approx/LICENSE b/Grid/algorithms/approx/LICENSE
similarity index 100%
rename from lib/algorithms/approx/LICENSE
rename to Grid/algorithms/approx/LICENSE
diff --git a/lib/algorithms/approx/MultiShiftFunction.cc b/Grid/algorithms/approx/MultiShiftFunction.cc
similarity index 100%
rename from lib/algorithms/approx/MultiShiftFunction.cc
rename to Grid/algorithms/approx/MultiShiftFunction.cc
diff --git a/lib/algorithms/approx/MultiShiftFunction.h b/Grid/algorithms/approx/MultiShiftFunction.h
similarity index 100%
rename from lib/algorithms/approx/MultiShiftFunction.h
rename to Grid/algorithms/approx/MultiShiftFunction.h
diff --git a/lib/algorithms/approx/README b/Grid/algorithms/approx/README
similarity index 100%
rename from lib/algorithms/approx/README
rename to Grid/algorithms/approx/README
diff --git a/lib/algorithms/approx/Remez.cc b/Grid/algorithms/approx/Remez.cc
similarity index 100%
rename from lib/algorithms/approx/Remez.cc
rename to Grid/algorithms/approx/Remez.cc
diff --git a/lib/algorithms/approx/Remez.h b/Grid/algorithms/approx/Remez.h
similarity index 100%
rename from lib/algorithms/approx/Remez.h
rename to Grid/algorithms/approx/Remez.h
diff --git a/lib/algorithms/approx/Zolotarev.cc b/Grid/algorithms/approx/Zolotarev.cc
similarity index 100%
rename from lib/algorithms/approx/Zolotarev.cc
rename to Grid/algorithms/approx/Zolotarev.cc
diff --git a/lib/algorithms/approx/Zolotarev.h b/Grid/algorithms/approx/Zolotarev.h
similarity index 100%
rename from lib/algorithms/approx/Zolotarev.h
rename to Grid/algorithms/approx/Zolotarev.h
diff --git a/lib/algorithms/approx/bigfloat.h b/Grid/algorithms/approx/bigfloat.h
similarity index 100%
rename from lib/algorithms/approx/bigfloat.h
rename to Grid/algorithms/approx/bigfloat.h
diff --git a/lib/algorithms/approx/bigfloat_double.h b/Grid/algorithms/approx/bigfloat_double.h
similarity index 100%
rename from lib/algorithms/approx/bigfloat_double.h
rename to Grid/algorithms/approx/bigfloat_double.h
diff --git a/lib/algorithms/iterative/AdefGeneric.h b/Grid/algorithms/iterative/AdefGeneric.h
similarity index 100%
rename from lib/algorithms/iterative/AdefGeneric.h
rename to Grid/algorithms/iterative/AdefGeneric.h
diff --git a/Grid/algorithms/iterative/BlockConjugateGradient.h b/Grid/algorithms/iterative/BlockConjugateGradient.h
new file mode 100644
index 00000000..cfce4aa5
--- /dev/null
+++ b/Grid/algorithms/iterative/BlockConjugateGradient.h
@@ -0,0 +1,694 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/algorithms/iterative/BlockConjugateGradient.h
+
+Copyright (C) 2017
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
+
+//////////////////////////////////////////////////////////////////////////
+// Block conjugate gradient. Dimension zero should be the block direction
+//////////////////////////////////////////////////////////////////////////
+template <class Field>
+class BlockConjugateGradient : public OperatorFunction<Field> {
+ public:
+
+  typedef typename Field::scalar_type scomplex;
+
+  int blockDim ;
+  int Nblock;
+
+  BlockCGtype CGtype;
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+                           // Defaults true.
+  RealD Tolerance;
+  Integer MaxIterations;
+  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+  Integer PrintInterval; //GridLogMessages or Iterative
+  
+  // Initialisers follow member declaration order; Nblock and IterationsToComplete start at 0 until a solve sets them.
+  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
+    : blockDim(_Orthog), Nblock(0), CGtype(cgtype), ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit), IterationsToComplete(0), PrintInterval(100) {}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Thin QR factorisation (google it)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  //Dimensions
+  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
+  //
+  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
+  //
+  //   Q  C = R => Q = R C^{-1}
+  //
+  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} 
+  //
+  // Set C = L^{dag}, and then Q^dag Q = ident 
+  //
+  // Checks:
+  // Cdag C = Rdag R ; passes.
+  // QdagQ  = 1      ; passes
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+void ThinQRfact (Eigen::MatrixXcd &m_rr, // out: slice inner-product matrix of R with itself, symmetrised below
+		 Eigen::MatrixXcd &C,    // out: L^dag, adjoint of the lower Cholesky factor of m_rr
+		 Eigen::MatrixXcd &Cinv, // out: inverse of C
+		 Field & Q,              // out: result of sliceMulMatrix(Q,Cinv,R,...), i.e. Q = R C^{-1} per the note below
+		 const Field & R)        // in : block of vectors to factorise
+{
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
+  sliceInnerProductMatrix(m_rr,R,R,Orthog);
+
+  // Force m_rr to be manifestly Hermitian so rounding-induced asymmetry does not reach the Cholesky (LLT) step
+  m_rr = 0.5*(m_rr+m_rr.adjoint());
+
+  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
+
+  C    = L.adjoint();
+  Cinv = C.inverse();
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Q = R C^{-1}
+  //
+  // Q_j  = R_i Cinv(i,j) 
+  //
+  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  sliceMulMatrix(Q,Cinv,R,Orthog);
+}
+// see comments above
+void ThinQRfact (Eigen::MatrixXcd &m_rr, // out: m_rr(b,bp) = innerProduct(R[b],R[bp]), symmetrised below
+		 Eigen::MatrixXcd &C,    // out: L^dag, adjoint of the lower Cholesky factor of m_rr
+		 Eigen::MatrixXcd &Cinv, // out: inverse of C
+		 std::vector<Field> & Q, // out: filled by MulMatrix(Q,Cinv,R); presumably Q = R C^{-1} — TODO confirm
+		 const std::vector<Field> & R) // in : one field per right-hand side
+{
+  InnerProductMatrix(m_rr,R,R);
+  // Average with the adjoint so m_rr is exactly Hermitian before the Cholesky (LLT) factorisation
+  m_rr = 0.5*(m_rr+m_rr.adjoint());
+
+  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
+
+  C    = L.adjoint();
+  Cinv = C.inverse();
+
+  MulMatrix(Q,Cinv,R);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Call one of several implementations
+////////////////////////////////////////////////////////////////////////////////////////////////////
+void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) // single-lattice entry point: dispatch on CGtype
+{
+  if ( CGtype == BlockCGrQ ) {
+    BlockCGrQsolve(Linop,Src,Psi);
+  } else if (CGtype == CGmultiRHS ) {
+    CGmultiRHSsolve(Linop,Src,Psi);
+  } else {
+    assert(0); // BlockCG, BlockCGVec and BlockCGrQVec are not handled by this overload
+  }
+}
+virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) // vector-of-fields entry point
+{
+  if ( CGtype == BlockCGrQVec ) {
+    BlockCGrQsolveVec(Linop,Src,Psi);
+  } else {
+    assert(0); // every other BlockCGtype is rejected by this overload
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQ implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+////////////////////////////////////////////////////////////////////////////
+void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
+{ // Solves A X_b = B_b for all Nblock slices at once; X is the initial guess on entry, the solution on exit
+  int Orthog = blockDim; // First dimension is block dim; this is an assumption
+  Nblock = B.Grid()->_fdimensions[Orthog];
+  // Nblock is the lattice extent along the block dimension; a leftover "FAKE" override to 8 was
+  // removed because it mis-sized the Nblock x Nblock matrices below for any other extent.
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
+
+  X.Checkerboard() = B.Checkerboard(); // accessor form, as used elsewhere in this changeset
+  conformable(X, B);
+
+  Field tmp(B);
+  Field Q(B);
+  Field D(B);
+  Field Z(B);
+  Field AD(B);
+
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  sliceNorm(ssq,B,Orthog);
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  sliceNorm(residuals,B,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  sliceNorm(residuals,X,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: initial search dir is guess
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+  Linop.HermOp(X, AD);
+  tmp = B - AD;  
+
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+  D=Q;
+
+  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    Linop.HermOp(D, Z);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+    
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+    
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+
+    sliceMaddTimer.Start();
+    sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+    
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    m_rr = m_C.adjoint() * m_C;
+
+    RealD max_resid=0;
+    RealD rrsum=0;
+    RealD rr;
+
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
+	      <<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
+
+    if ( max_resid < Tolerance*Tolerance ) { // max_resid is a squared ratio, hence Tolerance^2
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
+		  << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      Linop.HermOp(X, AD);
+      AD = AD-B;
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+	    
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+//////////////////////////////////////////////////////////////////////////
+// multiRHS conjugate gradient. Dimension zero should be the block direction
+// Use this for spread out across nodes
+//////////////////////////////////////////////////////////////////////////
+void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+{ // Runs one CG recurrence per slice along blockDim, sharing each HermOp application; Psi is the guess on entry
+  int Orthog = blockDim; // First dimension is block dim
+  Nblock = Src.Grid()->_fdimensions[Orthog];
+
+  std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
+
+  Psi.Checkerboard() = Src.Checkerboard(); // accessor form, as used elsewhere in this changeset
+  conformable(Psi, Src);
+
+  Field P(Src);
+  Field AP(Src);
+  Field R(Src);
+  
+  std::vector<ComplexD> v_pAp(Nblock);
+  std::vector<RealD> v_rr (Nblock);
+  std::vector<RealD> v_rr_inv(Nblock);
+  std::vector<RealD> v_alpha(Nblock);
+  std::vector<RealD> v_beta(Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+
+  sliceNorm(ssq,Src,Orthog);
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+
+  sliceNorm(residuals,Src,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  sliceNorm(residuals,Psi,Orthog);
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  // Initial search dir is guess
+  Linop.HermOp(Psi, AP);
+
+  R = Src - AP;  
+  P = R;
+  sliceNorm(v_rr,R,Orthog);
+
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch sliceNormTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+
+  SolverTimer.Start();
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    RealD rrsum=0;
+    for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
+
+    std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
+	      <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
+
+    MatrixTimer.Start();
+    Linop.HermOp(P, AP);
+    MatrixTimer.Stop();
+
+    // Alpha
+    sliceInnerTimer.Start();
+    sliceInnerProductVector(v_pAp,P,AP,Orthog);
+    sliceInnerTimer.Stop();
+    for(int b=0;b<Nblock;b++){
+      v_alpha[b] = v_rr[b]/real(v_pAp[b]);
+    }
+
+    // Psi, R update
+    sliceMaddTimer.Start();
+    sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi
+    sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
+    sliceMaddTimer.Stop();
+
+    // Beta
+    for(int b=0;b<Nblock;b++){
+      v_rr_inv[b] = 1.0/v_rr[b];
+    }
+    sliceNormTimer.Start();
+    sliceNorm(v_rr,R,Orthog);
+    sliceNormTimer.Stop();
+    for(int b=0;b<Nblock;b++){
+      v_beta[b] = v_rr_inv[b] *v_rr[b];
+    }
+
+    // Search update
+    sliceMaddTimer.Start();
+    sliceMaddVector(P,v_beta,P,R,Orthog);
+    sliceMaddTimer.Stop();
+
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    RealD max_resid=0;
+    for(int b=0;b<Nblock;b++){
+      RealD rr = v_rr[b]/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+    
+    if ( max_resid < Tolerance*Tolerance ) { // max_resid is a squared ratio, hence Tolerance^2
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
+      for(int b=0;b<Nblock;b++){
+	std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+
+      Linop.HermOp(Psi, AP);
+      AP = AP-Src;
+      std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+
+
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
+
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+
+void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
+  // Fill m(row,col) = innerProduct(X[row],Y[col]) for all Nblock x Nblock pairs; m is indexed, not resized.
+  for(int row=0;row<Nblock;row++){
+    for(int col=0;col<Nblock;col++) m(row,col) = innerProduct(X[row],Y[col]);
+  }
+}
+void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
+  // AP[col] = Y[col] + sum_row (scale*m(row,col)) * X[row], for col in [0,Nblock).
+  // Results are built in scratch fields first, so AP may alias X or Y.
+  // TODO: make this cache friendly with site outermost, parallel_for
+  std::vector<Field> acc(Nblock,X[0]);
+  for(int col=0;col<Nblock;col++){
+    acc[col] = Y[col];
+    for(int row=0;row<Nblock;row++) acc[col] = acc[col] + (scale*m(row,col))*X[row];
+  }
+  // Copy back only after every column has been formed.
+  for(int col=0;col<Nblock;col++){
+    AP[col] = acc[col];
+  }
+}
+void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
+  // AP[col] = sum_row m(row,col) * X[row], accumulated directly into AP
+  // (AP[col] is zeroed before X is read, so AP must not alias X).
+  // TODO: make this cache friendly with site outermost, parallel_for
+  for(int col=0;col<Nblock;col++){
+    AP[col] = Zero();
+    for(int row=0;row<Nblock;row++) AP[col] += (m(row,col))*X[row];
+  }
+}
+double normv(const std::vector<Field> &P){
+  // Sum of norm2(P[b]) over the first Nblock fields of P; P must hold at
+  // least Nblock fields.
+  double total = 0.0;
+  for(int blk=0;blk<Nblock;blk++) total += norm2(P[blk]);
+  return total;
+}
+
+////////////////////////////////////////////////////////////////////////////
+// BlockCGrQvec implementation:
+//--------------------------
+// X is guess/Solution
+// B is RHS
+// Solve A X_i = B_i    ;        i refers to Nblock index
+////////////////////////////////////////////////////////////////////////////
+void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) 
+{ // X: initial guess on entry, updated in place towards the solution; B: right-hand sides, one Field per block.
+  Nblock = B.size();
+  assert(Nblock == X.size());
+  // Sets IterationsToComplete on exit; on non-convergence it asserts only if ErrorOnNoConverge is set.
+  std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
+
+  for(int b=0;b<Nblock;b++){ 
+    X[b].Checkerboard() = B[b].Checkerboard(); // accessor form, as used throughout the rest of the code
+    conformable(X[b], B[b]);
+    conformable(X[b], X[0]); 
+  }
+  // Fake is a copy of B[0]; it is used only as the prototype for the work fields below.
+  Field Fake(B[0]);
+
+  std::vector<Field> tmp(Nblock,Fake);
+  std::vector<Field>   Q(Nblock,Fake);
+  std::vector<Field>   D(Nblock,Fake);
+  std::vector<Field>   Z(Nblock,Fake);
+  std::vector<Field>  AD(Nblock,Fake);
+
+  Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+  Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
+
+  // Initial residual computation & set up
+  std::vector<RealD> residuals(Nblock);
+  std::vector<RealD> ssq(Nblock);
+  // ssq[b] = norm2(B[b]) normalises the per-block residuals; sssum is their total.
+  RealD sssum=0;
+  for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
+  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
+  // NaN checks on the norms of the sources and of the initial guess.
+  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
+
+  /************************************************************************
+   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+   ************************************************************************
+   * Dimensions:
+   *
+   *   X,B==(Nferm x Nblock)
+   *   A==(Nferm x Nferm)
+   *  
+   * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
+   * 
+   * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+   * for k: 
+   *   Z  = AD
+   *   M  = [D^dag Z]^{-1}
+   *   X  = X + D MC
+   *   QS = Q - ZM
+   *   D  = Q + D S^dag
+   *   C  = S C
+   */
+  ///////////////////////////////////////
+  // Initial block: initial search dir is guess
+  ///////////////////////////////////////
+  std::cout << GridLogMessage<<"BlockCGrQvec algorithm initialisation " <<std::endl;
+
+  //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
+  for(int b=0;b<Nblock;b++) {
+    Linop.HermOp(X[b], AD[b]);
+    tmp[b] = B[b] - AD[b];  
+  }
+
+  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+
+  for(int b=0;b<Nblock;b++) D[b]=Q[b];
+
+  std::cout << GridLogMessage<<"BlockCGrQ vec computed initial residual and QR fact " <<std::endl;
+
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch sliceInnerTimer;
+  GridStopWatch sliceMaddTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
+
+  int k;
+  for (k = 1; k <= MaxIterations; k++) {
+
+    //3. Z  = AD
+    MatrixTimer.Start();
+    for(int b=0;b<Nblock;b++) Linop.HermOp(D[b], Z[b]);      
+    MatrixTimer.Stop();
+
+    //4. M  = [D^dag Z]^{-1}
+    sliceInnerTimer.Start();
+    InnerProductMatrix(m_DZ,D,Z);
+    sliceInnerTimer.Stop();
+    m_M       = m_DZ.inverse();
+    
+    //5. X  = X + D MC
+    m_tmp     = m_M * m_C;
+    sliceMaddTimer.Start();
+    MaddMatrix(X,m_tmp, D,X);     
+    sliceMaddTimer.Stop();
+
+    //6. QS = Q - ZM
+    sliceMaddTimer.Start();
+    MaddMatrix(tmp,m_M,Z,Q,-1.0);
+    sliceMaddTimer.Stop();
+    QRTimer.Start();
+    ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
+    QRTimer.Stop();
+    
+    //7. D  = Q + D S^dag
+    m_tmp = m_S.adjoint();
+    sliceMaddTimer.Start();
+    MaddMatrix(D,m_tmp,D,Q);
+    sliceMaddTimer.Stop();
+
+    //8. C  = S C
+    m_C = m_S*m_C;
+    
+    /*********************
+     * convergence monitor
+     *********************
+     */
+    m_rr = m_C.adjoint() * m_C; // its diagonal is used below as the per-block squared residual
+
+    RealD max_resid=0;
+    RealD rrsum=0;
+    RealD rr;
+
+    for(int b=0;b<Nblock;b++) {
+      rrsum+=real(m_rr(b,b));
+      rr = real(m_rr(b,b))/ssq[b];
+      if ( rr > max_resid ) max_resid = rr;
+    }
+
+    std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
+    // Converged once the worst block's relative squared residual is below Tolerance^2.
+    if ( max_resid < Tolerance*Tolerance ) { 
+
+      SolverTimer.Stop();
+
+      std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
+
+      for(int b=0;b<Nblock;b++){
+	std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
+      }
+      std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
+      // Recompute A X - B explicitly to report the true residual.
+      for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
+      for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
+      std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
+      std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
+
+      IterationsToComplete = k;
+      return;
+    }
+
+  }
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+  // Falling out of the loop leaves k == MaxIterations+1, which is what gets recorded.
+  if (ErrorOnNoConverge) assert(0);
+  IterationsToComplete = k;
+}
+
+};
+
+NAMESPACE_END(Grid);
+
diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h
similarity index 83%
rename from lib/algorithms/iterative/ConjugateGradient.h
rename to Grid/algorithms/iterative/ConjugateGradient.h
index 5b9f5db7..e818e059 100644
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@@ -55,6 +55,7 @@ public:
   void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
 
     psi.Checkerboard() = src.Checkerboard();
+
     conformable(psi, src);
 
     RealD cp, c, a, d, b, ssq, qq, b_pred;
@@ -70,7 +71,6 @@ public:
     
     Linop.HermOpAndNorm(psi, mmp, d, b);
     
-
     r = src - mmp;
     p = r;
 
@@ -96,38 +96,47 @@ public:
               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
 
     GridStopWatch LinalgTimer;
+    GridStopWatch InnerTimer;
+    GridStopWatch AxpyNormTimer;
+    GridStopWatch LinearCombTimer;
     GridStopWatch MatrixTimer;
     GridStopWatch SolverTimer;
 
     SolverTimer.Start();
     int k;
-    for (k = 1; k <= MaxIterations; k++) {
+    for (k = 1; k <= MaxIterations*1000; k++) {
       c = cp;
 
       MatrixTimer.Start();
-      Linop.HermOpAndNorm(p, mmp, d, qq);
+      Linop.HermOp(p, mmp);
       MatrixTimer.Stop();
 
       LinalgTimer.Start();
-      //  RealD    qqck = norm2(mmp);
-      //  ComplexD dck  = innerProduct(p,mmp);
 
+      InnerTimer.Start();
+      ComplexD dc  = innerProduct(p,mmp);
+      InnerTimer.Stop();
+      d = dc.real();
       a = c / d;
-      b_pred = a * (a * qq - d) / c;
 
+      AxpyNormTimer.Start();
       cp = axpy_norm(r, -a, mmp, r);
+      AxpyNormTimer.Stop();
       b = cp / c;
 
-      // Fuse these loops ; should be really easy
-      psi = a * p + psi;
-      p = p * b + r;
-
+      LinearCombTimer.Start();
+      auto psi_v = psi.View();
+      auto p_v   = p.View();
+      auto r_v   = r.View();
+      parallel_for(int ss=0;ss<src.Grid()->oSites();ss++){
+	vstream(psi_v[ss], a      *  p_v[ss] + psi_v[ss]);
+	vstream(p_v  [ss], b      *  p_v[ss] + r_v[ss]);
+      }
+      LinearCombTimer.Stop();
       LinalgTimer.Stop();
 
       std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
-                << " residual " << cp << " target " << rsq << std::endl;
-      std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl;
-      std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl;
+                << " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
 
       // Stopping condition
       if (cp <= rsq) {
@@ -148,6 +157,9 @@ public:
 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
 
         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
 
diff --git a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
similarity index 100%
rename from lib/algorithms/iterative/ConjugateGradientMixedPrec.h
rename to Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShift.h b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
similarity index 86%
rename from lib/algorithms/iterative/ConjugateGradientMultiShift.h
rename to Grid/algorithms/iterative/ConjugateGradientMultiShift.h
index 617eff2b..e2c19a4b 100644
--- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -43,6 +43,7 @@ class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
 public:                                                
   RealD   Tolerance;
   Integer MaxIterations;
+    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   int verbose;
   MultiShiftFunction shifts;
 
@@ -164,6 +165,15 @@ public:
       axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
     }
   
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch AXPYTimer;
+  GridStopWatch ShiftTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
   
     // Iteration loop
     int k;
@@ -171,7 +181,9 @@ public:
     for (k=1;k<=MaxIterations;k++){
     
       a = c /cp;
+    AXPYTimer.Start();
       axpy(p,a,p,r);
+    AXPYTimer.Stop();
     
       // Note to self - direction ps is iterated seperately
       // for each shift. Does not appear to have any scope
@@ -180,6 +192,7 @@ public:
       // However SAME r is used. Could load "r" and update
       // ALL ps[s]. 2/3 Bandwidth saving
       // New Kernel: Load r, vector of coeffs, vector of pointers ps
+    AXPYTimer.Start();
       for(int s=0;s<nshift;s++){
 	if ( ! converged[s] ) { 
 	  if (s==0){
@@ -190,22 +203,34 @@ public:
 	  }
 	}
       }
+    AXPYTimer.Stop();
     
       cp=c;
+    MatrixTimer.Start();  
+    //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
+    // The below is faster on KNL
+    Linop.HermOp(p,mmp); 
+    d=real(innerProduct(p,mmp));
     
-      Linop.HermOpAndNorm(p,mmp,d,qq);
+    MatrixTimer.Stop();  
+
+    AXPYTimer.Start();
       axpy(mmp,mass[0],p,mmp);
+    AXPYTimer.Stop();
       RealD rn = norm2(p);
       d += rn*mass[0];
     
       bp=b;
       b=-cp/d;
     
+    AXPYTimer.Start();
       c=axpy_norm(r,b,mmp,r);
+    AXPYTimer.Stop();
 
       // Toggle the recurrence history
       bs[0] = b;
       iz = 1-iz;
+    ShiftTimer.Start();
       for(int s=1;s<nshift;s++){
 	if((!converged[s])){
 	  RealD z0 = z[s][1-iz];
@@ -215,6 +240,7 @@ public:
 	  bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
 	}
       }
+    ShiftTimer.Stop();
     
       for(int s=0;s<nshift;s++){
 	int ss = s;
@@ -257,6 +283,9 @@ public:
     
       if ( all_converged ){
 
+    SolverTimer.Stop();
+
+
 	std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
 	std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
       
@@ -269,8 +298,19 @@ public:
 	  RealD cn = norm2(src);
 	  std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
 	}
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl;
+      std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl;
+
+      IterationsToComplete = k;	
+
 	return;
       }
+
+   
     }
     // ugly hack
     std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
diff --git a/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h b/Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
similarity index 100%
rename from lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
rename to Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h
diff --git a/lib/algorithms/iterative/ConjugateResidual.h b/Grid/algorithms/iterative/ConjugateResidual.h
similarity index 100%
rename from lib/algorithms/iterative/ConjugateResidual.h
rename to Grid/algorithms/iterative/ConjugateResidual.h
diff --git a/Grid/algorithms/iterative/Deflation.h b/Grid/algorithms/iterative/Deflation.h
new file mode 100644
index 00000000..509970c7
--- /dev/null
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -0,0 +1,104 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/algorithms/iterative/Deflation.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_DEFLATION_H
+#define GRID_DEFLATION_H
+
+namespace Grid { 
+
+// Guesser that ignores the source and starts the solve from guess = Zero().
+template<class Field> class ZeroGuesser: public LinearFunction<Field> {
+public:
+  virtual void operator()(const Field &src, Field &guess) { guess = Zero(); }
+};
+
+// Guesser that starts the solve from the source itself: guess = src.
+template<class Field> class SourceGuesser: public LinearFunction<Field> {
+public:
+  virtual void operator()(const Field &src, Field &guess) { guess = src; }
+};
+
+////////////////////////////////
+// Fine grid deflation
+////////////////////////////////
+// Deflation guess: adds, for every stored vector, evec[k] * <evec[k],src> / eval[k]
+// into guess (assuming axpy(ret,a,x,y) computes ret = a*x + y -- confirm).
+template<class Field> class DeflatedGuesser: public LinearFunction<Field> {
+private:
+  // Held by reference, not copied: the caller's vectors must outlive this object.
+  const std::vector<Field> &evec;
+  const std::vector<RealD> &eval;
+public:
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
+    : evec(_evec), eval(_eval) {}
+
+  virtual void operator()(const Field &src,Field &guess) {
+    guess = Zero();
+    assert(evec.size()==eval.size());
+    const auto nvec = evec.size();
+    for (int k=0;k<nvec;k++) {
+      const Field &mode = evec[k];
+      axpy(guess,TensorRemove(innerProduct(mode,src)) / eval[k],mode,guess);
+    }
+    guess.Checkerboard() = src.Checkerboard(); // stamped last, after the accumulation
+  }
+};
+
+// Deflation guess formed on a coarse grid: src goes through blockProject to a
+// coarse field, the coarse vectors are accumulated there, and blockPromote maps the result into guess.
+template<class FineField, class CoarseField>
+class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
+private:
+  const std::vector<FineField>   &subspace;
+  const std::vector<CoarseField> &evec_coarse;
+  const std::vector<RealD>       &eval_coarse;
+public:
+  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
+				const std::vector<CoarseField> &_evec_coarse,
+				const std::vector<RealD>       &_eval_coarse)
+    : subspace(_subspace), evec_coarse(_evec_coarse), eval_coarse(_eval_coarse)
+  {}
+  void operator()(const FineField &src,FineField &guess) { 
+    // evec_coarse[0] supplies the coarse grid, so evec_coarse must be non-empty.
+    const int nvec = (int)evec_coarse.size();
+    auto coarse_grid = evec_coarse[0].Grid();
+    CoarseField src_coarse(coarse_grid);
+    CoarseField guess_coarse(coarse_grid);
+    guess_coarse = Zero();
+    blockProject(src_coarse,src,subspace);
+    for (int k=0;k<nvec;k++) {
+      const CoarseField &mode = evec_coarse[k];
+      axpy(guess_coarse,TensorRemove(innerProduct(mode,src_coarse)) / eval_coarse[k],mode,guess_coarse);
+    }
+    blockPromote(guess_coarse,guess,subspace);
+    guess.Checkerboard() = src.Checkerboard();
+  }
+};
+
+
+
+}
+#endif
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
similarity index 98%
rename from lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
rename to Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
index cbe3b75e..6b92b67a 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -53,7 +53,8 @@ template<class Field>
 void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
 {
   typedef decltype(basis[0].View()) View;
-  std::vector<View> basis_v(basis.size());
+  auto tmp_v = basis[0].View();
+  std::vector<View> basis_v(basis.size(),tmp_v);
   typedef typename Field::vector_object vobj;
   GridBase* grid = basis[0].Grid();
       
@@ -63,7 +64,7 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
 
   thread_region
   {
-    std::vector < vobj > B(Nm); // Thread private
+    std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
     thread_loop_in_region( (int ss=0;ss < grid->oSites();ss++),{
       for(int j=j0; j<j1; ++j) B[j]=0.;
       
@@ -188,6 +189,7 @@ enum IRLdiagonalisation {
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
  public:
+
   LinearFunction<Field>       &_HermOp;
   ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  };
   int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
@@ -250,6 +252,7 @@ class ImplicitlyRestartedLanczos {
   /////////////////////////
   
 public:       
+
   //////////////////////////////////////////////////////////////////
   // PAB:
   //////////////////////////////////////////////////////////////////
@@ -497,15 +500,13 @@ until convergence
 	Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
 
 	//  power of two search pattern;  not every evalue in eval2 is assessed.
+	int allconv =1;
 	for(int jj = 1; jj<=Nstop; jj*=2){
 	  int j = Nstop-jj;
 	  RealD e = eval2_copy[j]; // Discard the evalue
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
-	  if( _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
-	    if ( j > Nconv ) {
-	      Nconv=j+1;
-	      jj=Nstop; // Terminate the scan
-	    }
+	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
+	    allconv=0;
 	  }
 	}
 	// Do evec[0] for good measure
@@ -513,8 +514,10 @@ until convergence
 	  int j=0;
 	  RealD e = eval2_copy[0]; 
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
-	  _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox);
+	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
 	}
+	if ( allconv ) Nconv = Nstop;
+
 	// test if we converged, if so, terminate
 	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
 	//	if( Nconv>=Nstop || beta_k < betastp){
diff --git a/lib/algorithms/iterative/LocalCoherenceLanczos.h b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
similarity index 71%
rename from lib/algorithms/iterative/LocalCoherenceLanczos.h
rename to Grid/algorithms/iterative/LocalCoherenceLanczos.h
index ca5193df..40b4b347 100644
--- a/lib/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h
@@ -47,6 +47,7 @@ public:
 struct LocalCoherenceLanczosParams : Serializable {
 public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
+				  bool, saveEvecs,
 				  bool, doFine,
 				  bool, doFineRead,
 				  bool, doCoarse,
@@ -72,21 +73,24 @@ public:
   typedef Lattice<Fobj>          FineField;
 
   LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;
 
-  ProjectedHermOp(LinearOperatorBase<FineField>& linop,  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
-    _Linop(linop),
-    _Aggregate(aggregate)  {  };
+  ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : 
+    _Linop(linop), subspace(_subspace)
+  {  
+    assert(subspace.size() >0);
+  };
 
   void operator()(const CoarseField& in, CoarseField& out) {
+    GridBase *FineGrid = subspace[0].Grid();    
+    int   checkerboard = subspace[0].Checkerboard();
 
-    GridBase *FineGrid = _Aggregate.FineGrid;
-    FineField fin(FineGrid);
-    FineField fout(FineGrid);
+    FineField fin (FineGrid);     fin.Checkerboard()= checkerboard;
+    FineField fout(FineGrid);   fout.Checkerboard() = checkerboard;
 
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
+    blockPromote(in,fin,subspace);       std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
     _Linop.HermOp(fin,fout);                   std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
+    blockProject(out,fout,subspace);     std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
   }
 };
 
@@ -101,24 +105,27 @@ public:
 
   OperatorFunction<FineField>   & _poly;
   LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
+  std::vector<FineField>        &subspace;
 
-  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop, 
-			  Aggregation<Fobj,CComplex,nbasis> &aggregate) : 
+  ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
+			  LinearOperatorBase<FineField>& linop, 
+			  std::vector<FineField> & _subspace) :
     _poly(poly),
     _Linop(linop),
-    _Aggregate(aggregate)  {  };
+    subspace(_subspace)
+  {  };
 
   void operator()(const CoarseField& in, CoarseField& out) {
 
-    GridBase *FineGrid = _Aggregate.FineGrid;
+    GridBase *FineGrid = subspace[0].Grid();    
+    int   checkerboard = subspace[0].Checkerboard();
 
-    FineField fin(FineGrid) ;fin.Checkerboard()  =_Aggregate.Checkerboard();
-    FineField fout(FineGrid);fout.Checkerboard() =_Aggregate.Checkerboard();
+    FineField fin (FineGrid); fin.Checkerboard() =checkerboard;
+    FineField fout(FineGrid);fout.Checkerboard() =checkerboard;
     
-    _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
+    blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
     _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
-    _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
+    blockProject(out,fout,subspace);           std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
   }
 };
 
@@ -134,19 +141,23 @@ public:
   LinearFunction<CoarseField> & _Poly;
   OperatorFunction<FineField>   & _smoother;
   LinearOperatorBase<FineField> &_Linop;
-  Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
   RealD                             _coarse_relax_tol;
+  std::vector<FineField>        &_subspace;
+  
   ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
 					   OperatorFunction<FineField>   &smoother,
 					   LinearOperatorBase<FineField> &Linop,
-					   Aggregation<Fobj,CComplex,nbasis> &Aggregate,
+					   std::vector<FineField>        &subspace,
 					   RealD coarse_relax_tol=5.0e3) 
-    : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol)  {    };
+    : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
+      _coarse_relax_tol(coarse_relax_tol)  
+  {    };
 
   int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
   {
     CoarseField v(B);
     RealD eval_poly = eval;
+
     // Apply operator
     _Poly(B,v);
 
@@ -170,14 +181,13 @@ public:
   }
   int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
   {
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    int checkerboard   = _Aggregate.Checkerboard();
-
+    GridBase *FineGrid = _subspace[0].Grid();    
+    int checkerboard   = _subspace[0].Checkerboard();
     FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
     FineField fv(FineGrid);fv.Checkerboard() =checkerboard;
 
-    _Aggregate.PromoteFromSubspace(B,fv);
+    blockPromote(B,fv,_subspace);  
+    
     _smoother(_Linop,fv,fB); 
 
     RealD eval_poly = eval;
@@ -219,27 +229,67 @@ protected:
   int _checkerboard;
   LinearOperatorBase<FineField>                 & _FineOp;
   
-  // FIXME replace Aggregation with vector of fine; the code reuse is too small for
-  // the hassle and complexity of cross coupling.
-  Aggregation<Fobj,CComplex,nbasis>               _Aggregate;  
-  std::vector<RealD>                              evals_fine;
-  std::vector<RealD>                              evals_coarse; 
-  std::vector<CoarseField>                        evec_coarse;
+  std::vector<RealD>                              &evals_fine;
+  std::vector<RealD>                              &evals_coarse; 
+  std::vector<FineField>                          &subspace;
+  std::vector<CoarseField>                        &evec_coarse;
+
+private:
+  std::vector<RealD>                              _evals_fine;
+  std::vector<RealD>                              _evals_coarse; 
+  std::vector<FineField>                          _subspace;
+  std::vector<CoarseField>                        _evec_coarse;
+
 public:
+
   LocalCoherenceLanczos(GridBase *FineGrid,
 			GridBase *CoarseGrid,
 			LinearOperatorBase<FineField> &FineOp,
 			int checkerboard) :
     _CoarseGrid(CoarseGrid),
     _FineGrid(FineGrid),
-    _Aggregate(CoarseGrid,FineGrid,checkerboard),
     _FineOp(FineOp),
-    _checkerboard(checkerboard)
+    _checkerboard(checkerboard),
+    evals_fine  (_evals_fine),
+    evals_coarse(_evals_coarse),
+    subspace    (_subspace),
+    evec_coarse(_evec_coarse)
   {
     evals_fine.resize(0);
     evals_coarse.resize(0);
   };
-  void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
+  //////////////////////////////////////////////////////////////////////////
+  // Alternate constructor: external storage, for use by the Hadrons module
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+			GridBase *CoarseGrid,
+			LinearOperatorBase<FineField> &FineOp,
+			int checkerboard,
+			std::vector<FineField>   &ext_subspace,
+			std::vector<CoarseField> &ext_coarse,
+			std::vector<RealD>       &ext_eval_fine,
+			std::vector<RealD>       &ext_eval_coarse) :
+    // grids, operator and checkerboard are stored as given
+    _CoarseGrid(CoarseGrid), _FineGrid(FineGrid),
+    _FineOp(FineOp), _checkerboard(checkerboard),
+    // the four container references are bound to the caller's vectors
+    evals_fine  (ext_eval_fine),
+    evals_coarse(ext_eval_coarse),
+    subspace    (ext_subspace),
+    evec_coarse (ext_coarse)
+  {
+    // Note: this empties the caller's two eigenvalue vectors.
+    evals_fine.clear();
+    evals_coarse.clear();
+  };
+
+  void Orthogonalise(void ) { // two successive blockOrthogonalise passes over subspace
+    CoarseScalar ip(_CoarseGrid); // coarse-grid field handed to both passes
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(ip,subspace);
+    std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
+    blockOrthogonalise(ip,subspace);
+  };
 
   template<typename T>  static RealD normalise(T& v) 
   {
@@ -248,43 +298,44 @@ public:
     v = v * (1.0/nn);
     return nn;
   }
-
+  /*
   void fakeFine(void)
   {
     int Nk = nbasis;
-    _Aggregate.subspace.resize(Nk,_FineGrid);
-    _Aggregate.subspace[0]=1.0;
-    _Aggregate.subspace[0].Checkerboard()=_checkerboard;
-    normalise(_Aggregate.subspace[0]);
+    subspace.resize(Nk,_FineGrid);
+    subspace[0]=1.0;
+    subspace[0].Checkerboard()=_checkerboard;
+    normalise(subspace[0]);
     PlainHermOp<FineField>    Op(_FineOp);
     for(int k=1;k<Nk;k++){
-      _Aggregate.subspace[k].Checkerboard()=_checkerboard;
-      Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
-      normalise(_Aggregate.subspace[k]);
+      subspace[k].Checkerboard()=_checkerboard;
+      Op(subspace[k-1],subspace[k]);
+      normalise(subspace[k]);
     }
   }
+  */
 
   void testFine(RealD resid) 
   {
     assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
     PlainHermOp<FineField>    Op(_FineOp);
     ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
     for(int k=0;k<nbasis;k++){
-      assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
+      assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
     }
   }
 
   void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) 
   {
     assert(evals_fine.size() == nbasis);
-    assert(_Aggregate.subspace.size() == nbasis);
+    assert(subspace.size() == nbasis);
     //////////////////////////////////////////////////////////////////////////////////////////////////
     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
     //////////////////////////////////////////////////////////////////////////////////////////////////
     Chebyshev<FineField>                          ChebySmooth(cheby_smooth);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
 
     for(int k=0;k<evec_coarse.size();k++){
       if ( k < nbasis ) { 
@@ -304,34 +355,34 @@ public:
     PlainHermOp<FineField>    Op(_FineOp);
 
     evals_fine.resize(Nm);
-    _Aggregate.subspace.resize(Nm,_FineGrid);
+    subspace.resize(Nm,_FineGrid);
 
     ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
 
     FineField src(_FineGrid); src=1.0; src.Checkerboard() = _checkerboard;
 
     int Nconv;
-    IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
+    IRL.calc(evals_fine,subspace,src,Nconv,false);
     
     // Shrink down to number saved
     assert(Nstop>=nbasis);
     assert(Nconv>=nbasis);
     evals_fine.resize(nbasis);
-    _Aggregate.subspace.resize(nbasis,_FineGrid);
+    subspace.resize(nbasis,_FineGrid);
   }
   void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
 		  int Nstop, int Nk, int Nm,RealD resid, 
 		  RealD MaxIt, RealD betastp, int MinRes)
   {
     Chebyshev<FineField>                          Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,_Aggregate);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
     //////////////////////////////////////////////////////////////////////////////////////////////////
     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
     //////////////////////////////////////////////////////////////////////////////////////////////////
 
     Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
 
     evals_coarse.resize(Nm);
     evec_coarse.resize(Nm,_CoarseGrid);
diff --git a/lib/algorithms/iterative/NormalEquations.h b/Grid/algorithms/iterative/NormalEquations.h
similarity index 100%
rename from lib/algorithms/iterative/NormalEquations.h
rename to Grid/algorithms/iterative/NormalEquations.h
diff --git a/lib/algorithms/iterative/PrecConjugateResidual.h b/Grid/algorithms/iterative/PrecConjugateResidual.h
similarity index 100%
rename from lib/algorithms/iterative/PrecConjugateResidual.h
rename to Grid/algorithms/iterative/PrecConjugateResidual.h
diff --git a/lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
similarity index 100%
rename from lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
rename to Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h
diff --git a/Grid/algorithms/iterative/SchurRedBlack.h b/Grid/algorithms/iterative/SchurRedBlack.h
new file mode 100644
index 00000000..4b9a2ed8
--- /dev/null
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -0,0 +1,473 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_SCHUR_RED_BLACK_H
+#define GRID_SCHUR_RED_BLACK_H
+
+
+  /*
+   * Red black Schur decomposition
+   *
+   *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
+   *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
+   *                =         L                     D                     U
+   *
+   * L^-1 = (1              0 )
+   *        (-MoeMee^{-1}   1 )   
+   * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
+   *           ( 0       1                    )
+   * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
+   *           ( 0       1                    )
+   *
+   * U^-1 = (1   -Mee^{-1} Meo)
+   *        (0    1           )
+   * U^{dag} = ( 1                 0)
+   *           (Meo^dag Mee^{-dag} 1)
+   * U^{-dag} = (  1                 0)
+   *            (-Meo^dag Mee^{-dag} 1)
+   ***********************
+   *     M psi = eta
+   ***********************
+   *Odd
+   * i)                 D_oo psi_o =  L^{-1}  eta_o
+   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
+   *
+   * Wilson:
+   *      (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
+   * Stag:
+   *      D_oo psi_o = L^{-1}  eta =    (eta_o - Moe Mee^{-1} eta_e)
+   *
+   * L^-1 eta  = (1              0 ) (eta_e)
+   *             (-MoeMee^{-1}   1 ) (eta_o)
+   *
+   *Even
+   * ii)  Mee psi_e + Meo psi_o = src_e
+   *
+   *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+   *
+   * 
+   * TODO: Other options:
+   * 
+   * a) change checkerboards for Schur e<->o
+   *
+   * Left precon by Moo^-1
+   * b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_o =  (D_oo)^dag M_oo^-dag Moo^-1 L^{-1}  eta_o
+   *                              eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
+   *
+   * Right precon by Moo^-1
+   * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_o = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
+   *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
+   *                              psi_o = M_oo^-1 phi_o
+   * TODO: Deflation 
+   */
+namespace Grid {
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Use base class to share code
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Take a matrix and form a Red Black solver calling a Herm solver
+  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackBase {
+  protected:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+    OperatorFunction<Field> & _HermitianRBSolver; // solver handed to the derived RedBlackSolve() overrides
+    int CBfactorise;                              // set to 0 by the constructor; not read in this class (see FIXME below)
+    bool subGuess;                                // true => the initial guess is subtracted from the odd solution after the solve
+  public:
+    virtual ~SchurRedBlackBase() = default;       // polymorphic base: allow safe destruction through a base pointer
+    SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  :
+    _HermitianRBSolver(HermitianRBSolver) 
+    { 
+      CBfactorise = 0;
+      subtractGuess(initSubGuess);
+    };
+    void subtractGuess(const bool initSubGuess)   // switch guess subtraction on or off
+    {
+      subGuess = initSubGuess;
+    }
+    bool isSubtractGuess(void)                    // current guess-subtraction setting
+    {
+      return subGuess;
+    }
+
+    /////////////////////////////////////////////////////////////
+    // Shared code: these two overloads forward to the Guesser versions below with a ZeroGuesser
+    /////////////////////////////////////////////////////////////
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out) 
+    {
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    // Multi-RHS solve: for each b, out[b] receives the reconstructed solution for in[b]; guess(src,sol) seeds sol_o[b]
+    template<class Guesser>
+    void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) 
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+      int nblock = in.size();
+      assert(out.size() >= in.size()); // out[b] is written for every b < nblock, so the caller must pre-size out
+      std::vector<Field> src_o(nblock,grid);
+      std::vector<Field> sol_o(nblock,grid);
+      
+      std::vector<Field> guess_save;
+
+      Field resid(fgrid);
+      Field tmp(grid);
+
+      ////////////////////////////////////////////////
+      // Prepare RedBlack source
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++){
+	RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
+      }
+      ////////////////////////////////////////////////
+      // Make the guesses
+      ////////////////////////////////////////////////
+      if ( subGuess ) guess_save.resize(nblock,grid);
+
+      for(int b=0;b<nblock;b++){
+	guess(src_o[b],sol_o[b]); 
+
+	if ( subGuess ) { 
+	  guess_save[b] = sol_o[b];
+	}
+      }
+      //////////////////////////////////////////////////////////////
+      // Call the block solver
+      //////////////////////////////////////////////////////////////
+      std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
+      // A2A boolean behavioural control & reconstruct other checkerboard
+      ////////////////////////////////////////////////
+      for(int b=0;b<nblock;b++) {
+
+	if (subGuess)   sol_o[b] = sol_o[b] - guess_save[b];
+
+	///////// Needs even source //////////////
+	pickCheckerboard(Even,tmp,in[b]);
+	RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
+
+	/////////////////////////////////////////////////
+	// Check unprec residual if possible
+	/////////////////////////////////////////////////
+	if ( ! subGuess ) {
+	  _Matrix.M(out[b],resid); 
+	  resid = resid-in[b];
+	  RealD ns = norm2(in[b]);
+	  RealD nr = norm2(resid);
+	
+	  std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
+	} else {
+	  std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
+	}
+
+      }
+    }
+    template<class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
+      // Single-RHS solve: build the Schur source, seed sol_o with guess(), solve, then reconstruct out
+      // FIXME CGdiagonalMee not implemented virtual function
+      // FIXME use CBfactorise to control schur decomp
+      GridBase *grid = _Matrix.RedBlackGrid();
+      GridBase *fgrid= _Matrix.Grid();
+
+      Field resid(fgrid);
+      Field src_o(grid);
+      Field src_e(grid);
+      Field sol_o(grid);
+
+      ////////////////////////////////////////////////
+      // RedBlack source
+      ////////////////////////////////////////////////
+      RedBlackSource(_Matrix,in,src_e,src_o);
+
+      ////////////////////////////////
+      // Construct the guess
+      ////////////////////////////////
+      guess(src_o,sol_o);
+
+      // The saved guess is only read when subGuess is set, so skip the field copy otherwise
+      Field  guess_save(grid);
+      if ( subGuess ) guess_save = sol_o;
+
+      //////////////////////////////////////////////////////////////
+      // Call the red-black solver
+      //////////////////////////////////////////////////////////////
+      RedBlackSolve(_Matrix,src_o,sol_o);
+
+      ////////////////////////////////////////////////
+      // Fionn A2A boolean behavioural control
+      ////////////////////////////////////////////////
+      if (subGuess)      sol_o= sol_o-guess_save;
+
+      ///////////////////////////////////////////////////
+      // RedBlack solution needs the even source
+      ///////////////////////////////////////////////////
+      RedBlackSolution(_Matrix,sol_o,src_e,out);
+
+      // Verify the unprec residual
+      if ( ! subGuess ) {
+        _Matrix.M(out,resid); 
+        resid = resid-in;
+        RealD ns = norm2(in);
+        RealD nr = norm2(resid);
+
+        std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
+      } else {
+        std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
+      }
+    }     
+    
+    /////////////////////////////////////////////////////////////
+    // Override in derived. Virtual, so the templated operator() overloads above dispatch to the derived versions
+    /////////////////////////////////////////////////////////////
+    virtual void RedBlackSource  (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)                =0;
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)          =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)                           =0;
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)=0;
+
+  };
+
+  template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) 
+      :    SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) 
+    {
+    }
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation
+    //////////////////////////////////////////////////////
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      // Splits src into src_e / src_o, then overwrites src_o with Mooee * (src_o - Moe MeeInv src_e)
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+
+      _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
+    }
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol)
+    {
+      // Computes the even-site solution from sol_o and src_e_c, then writes both checkerboards into sol
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field   sol_e(grid);
+      Field   src_e(grid);
+
+      // src_e is assigned (src_e_c - Meo sol_o) directly below, so the const input needs no prior copy
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.Checkerboard()   ==Even);
+      src_e = src_e_c-tmp;             assert(  src_e.Checkerboard() ==Even);
+      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+  template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal has Mooee on it.
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
+      : SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) {};
+
+
+    //////////////////////////////////////////////////////
+    // Override RedBlack specialisation
+    //////////////////////////////////////////////////////
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      // Splits src into src_e / src_o, then overwrites src_o with MpcDag * (src_o - Moe MeeInv src_e)
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+
+      /////////////////////////////////////////////////////
+      // src_o = Mdag * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+
+      // get the right MpcDag
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+
+    }
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    {
+      // Computes the even-site solution from sol_o and src_e, then writes both checkerboards into sol
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   tmp(grid);
+      Field  sol_e(grid);
+      Field  src_e_i(grid);
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o,tmp);          assert(  tmp.Checkerboard()   ==Even);
+      src_e_i = src_e-tmp;               assert(  src_e_i.Checkerboard() ==Even);
+      _Matrix.MooeeInv(src_e_i,sol_e);   assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e); assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o); assert(  sol_o.Checkerboard() ==Odd );
+    }
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Site diagonal is identity, right preconditioned by Mee^inv
+  // ( 1 - Meo Moo^inv Moe Mee^inv  ) phi =( 1 - Meo Moo^inv Moe Mee^inv  ) Mee psi = eta
+  //=> psi = MeeInv phi
+  ///////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
+  public:
+    typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+
+    /////////////////////////////////////////////////////
+    // Wrap the usual normal equations Schur trick
+    /////////////////////////////////////////////////////
+    SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)  
+      : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess) {};
+
+    virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+    {
+      // Splits src into src_e / src_o, then overwrites src_o with MpcDag * (src_o - Moe MeeInv src_e)
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      
+      Field   tmp(grid);
+      Field  Mtmp(grid);
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd ,src_o,src);
+    
+      /////////////////////////////////////////////////////
+      // src_o = Mdag * (source_o - Moe MeeInv source_e)
+      /////////////////////////////////////////////////////
+      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
+      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
+      tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
+
+      // get the right MpcDag
+      _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
+    }
+
+    virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+    {
+      // Applies MooeeInv to sol_o, computes the even-site solution, then writes both checkerboards into sol
+      GridBase *grid = _Matrix.RedBlackGrid();
+
+      Field   sol_o_i(grid);
+      Field   tmp(grid);
+      Field   sol_e(grid);
+
+      ////////////////////////////////////////////////
+      // MooeeInv due to precond
+      ////////////////////////////////////////////////
+      // Written straight into sol_o_i; no intermediate field copy is needed
+      _Matrix.MooeeInv(sol_o,sol_o_i);
+
+      ///////////////////////////////////////////////////
+      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+      ///////////////////////////////////////////////////
+      _Matrix.Meooe(sol_o_i,tmp);    assert(  tmp.Checkerboard()   ==Even);
+      tmp = src_e-tmp;               assert(  tmp.Checkerboard()   ==Even); // check the computed difference, as the sibling solvers do
+      _Matrix.MooeeInv(tmp,sol_e);   assert(  sol_e.Checkerboard() ==Even);
+     
+      setCheckerboard(sol,sol_e);    assert(  sol_e.Checkerboard() ==Even);
+      setCheckerboard(sol,sol_o_i);  assert(  sol_o_i.Checkerboard() ==Odd );
+    };
+
+    virtual void RedBlackSolve   (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+    };
+    virtual void RedBlackSolve   (Matrix & _Matrix,const std::vector<Field> &src_o,  std::vector<Field> &sol_o)
+    {
+      SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
+      this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); 
+    }
+  };
+}
+#endif
diff --git a/lib/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc
similarity index 100%
rename from lib/allocator/AlignedAllocator.cc
rename to Grid/allocator/AlignedAllocator.cc
diff --git a/lib/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
similarity index 99%
rename from lib/allocator/AlignedAllocator.h
rename to Grid/allocator/AlignedAllocator.h
index f55fe71b..ed1fbec2 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -219,7 +219,6 @@ public:
     if ( __freeme ) free((void *)__freeme);
   #endif
 #endif
-
   }
   void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
diff --git a/lib/cartesian/Cartesian.h b/Grid/cartesian/Cartesian.h
similarity index 100%
rename from lib/cartesian/Cartesian.h
rename to Grid/cartesian/Cartesian.h
diff --git a/lib/cartesian/Cartesian_base.h b/Grid/cartesian/Cartesian_base.h
similarity index 99%
rename from lib/cartesian/Cartesian_base.h
rename to Grid/cartesian/Cartesian_base.h
index 813ea548..76abe0ee 100644
--- a/lib/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@@ -60,6 +60,7 @@ public:
 
   virtual ~GridBase() = default;
 
+
   // Physics Grid information.
   Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
   Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
@@ -79,6 +80,8 @@ public:
   Coordinate _lstart;     // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
   Coordinate _lend  ;     // local end of array in gcoors   _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
 
+    bool _isCheckerBoarded; 
+
 public:
 
   ////////////////////////////////////////////////////////////////
diff --git a/lib/cartesian/Cartesian_full.h b/Grid/cartesian/Cartesian_full.h
similarity index 97%
rename from lib/cartesian/Cartesian_full.h
rename to Grid/cartesian/Cartesian_full.h
index 5aa0747b..c083817b 100644
--- a/lib/cartesian/Cartesian_full.h
+++ b/Grid/cartesian/Cartesian_full.h
@@ -96,6 +96,7 @@ public:
     ///////////////////////
     // Grid information
     ///////////////////////
+      _isCheckerBoarded = false;
     _ndimension = dimensions.size();
 
     _fdimensions.resize(_ndimension);
@@ -121,6 +122,7 @@ public:
 
         // Use a reduced simd grid
         _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
+        //std::cout << _ldimensions[d] << "  " << _gdimensions[d] << "  " << _processors[d] << std::endl;
         assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
 
         _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
@@ -165,6 +167,7 @@ public:
         block = block * _rdimensions[d];
       }
   };
+
 };
 
 NAMESPACE_END(Grid);
diff --git a/lib/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h
similarity index 99%
rename from lib/cartesian/Cartesian_red_black.h
rename to Grid/cartesian/Cartesian_red_black.h
index 3cde6b7f..34f763d2 100644
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/Grid/cartesian/Cartesian_red_black.h
@@ -140,9 +140,8 @@ public:
 	    const Coordinate &checker_dim_mask,
 	    int checker_dim)
   {
-    ///////////////////////
-    // Grid information
-    ///////////////////////
+
+      _isCheckerBoarded = true;
     _checker_dim = checker_dim;
     assert(checker_dim_mask[checker_dim] == 1);
     _ndimension = dimensions.size();
diff --git a/lib/communicator/Communicator.h b/Grid/communicator/Communicator.h
similarity index 100%
rename from lib/communicator/Communicator.h
rename to Grid/communicator/Communicator.h
diff --git a/lib/communicator/Communicator_base.cc b/Grid/communicator/Communicator_base.cc
similarity index 100%
rename from lib/communicator/Communicator_base.cc
rename to Grid/communicator/Communicator_base.cc
diff --git a/lib/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
similarity index 99%
rename from lib/communicator/Communicator_base.h
rename to Grid/communicator/Communicator_base.h
index 0eebd305..11dbfcbb 100644
--- a/lib/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -83,6 +83,7 @@ private:
 
 public:
   
+  
   ////////////////////////////////////////////////////////////////////////////////////////
   // Wraps MPI_Cart routines, or implements equivalent on other impls
   ////////////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
similarity index 92%
rename from lib/communicator/Communicator_mpi3.cc
rename to Grid/communicator/Communicator_mpi3.cc
index c13bc2e4..4d02c7b9 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -44,11 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
   MPI_Initialized(&flag); // needed to coexist with other libs apparently
   if ( !flag ) {
     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    //assert (provided == MPI_THREAD_MULTIPLE);
+    //    assert (provided == MPI_THREAD_MULTIPLE);
+    //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
+    if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
+        (nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) ) {
+      assert(0);
+    }
   }
 
-  Grid_quiesce_nodes();
-
+  // Never clean up as done once.
   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
 
   GlobalSharedMemory::Init(communicator_world);
@@ -84,9 +88,17 @@ void  CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) 
 {
   MPI_Comm optimal_comm;
-  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm); // Remap using the shared memory optimising routine
+  ////////////////////////////////////////////////////
+  // Remap using the shared memory optimising routine
+  // The remap creates a comm which must be freed
+  ////////////////////////////////////////////////////
+  GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm);
   InitFromMPICommunicator(processors,optimal_comm);
   SetCommunicator(optimal_comm);
+  ///////////////////////////////////////////////////
+  // Free the temp communicator
+  ///////////////////////////////////////////////////
+  MPI_Comm_free(&optimal_comm);
 }
 
 //////////////////////////////////
@@ -111,10 +123,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
   // split the communicator
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   //  int Nparent = parent._processors ; 
-  //  std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
   int Nparent;
   MPI_Comm_size(parent.communicator,&Nparent);
-  //  std::cout << " Parent size  "<<Nparent <<std::endl;
 
   int childsize=1;
   for(int d=0;d<processors.size();d++) {
@@ -123,8 +133,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
   int Nchild = Nparent/childsize;
   assert (childsize * Nchild == Nparent);
 
-  //  std::cout << " child size  "<<childsize <<std::endl;
-
   Coordinate ccoor(_ndimension); // coor within subcommunicator
   Coordinate scoor(_ndimension); // coor of split within parent
   Coordinate ssize(_ndimension); // coor of split within parent
@@ -152,8 +160,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
 
   } else {
     srank = 0;
-    comm_split    = parent.communicator;
-    //    std::cout << " Inherited communicator " <<comm_split <<std::endl;
+    int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
+    assert(ierr==0);
   }
 
   //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -166,6 +174,11 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
   //////////////////////////////////////////////////////////////////////////////////////////////////////
   SetCommunicator(comm_split);
 
+  ///////////////////////////////////////////////
+  // Free the temp communicator 
+  ///////////////////////////////////////////////
+  MPI_Comm_free(&comm_split);
+
   if(0){ 
     std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
     for(int d=0;d<processors.size();d++){
@@ -179,6 +192,9 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
 
 void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors, MPI_Comm communicator_base)
 {
+  ////////////////////////////////////////////////////
+  // Creates communicator, and the communicator_halo
+  ////////////////////////////////////////////////////
   _ndimension = processors.size();
   _processor_coor.resize(_ndimension);
 
diff --git a/lib/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
similarity index 100%
rename from lib/communicator/Communicator_none.cc
rename to Grid/communicator/Communicator_none.cc
diff --git a/lib/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc
similarity index 100%
rename from lib/communicator/SharedMemory.cc
rename to Grid/communicator/SharedMemory.cc
diff --git a/lib/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h
similarity index 99%
rename from lib/communicator/SharedMemory.h
rename to Grid/communicator/SharedMemory.h
index 1c3953d0..fc9366b2 100644
--- a/lib/communicator/SharedMemory.h
+++ b/Grid/communicator/SharedMemory.h
@@ -123,6 +123,7 @@ protected:
 
 public:
   SharedMemory() {};
+  ~SharedMemory();
   ///////////////////////////////////////////////////////////////////////////////////////
   // set the buffers & sizes
   ///////////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
similarity index 65%
rename from lib/communicator/SharedMemoryMPI.cc
rename to Grid/communicator/SharedMemoryMPI.cc
index 44939332..0f7d929b 100644
--- a/lib/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 
 #include <Grid/GridCore.h>
+#include <pwd.h>
 
 #ifdef GRID_NVCC
 #include <cuda_runtime_api.h>
@@ -120,19 +121,150 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   assert(WorldNode!=-1);
   _ShmSetup=1;
 }
-
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+// Gray encode support 
+int BinaryToGray (int  binary) {
+  // Gray encoding: XOR the value with itself shifted right by one bit.
+  return binary ^ (binary>>1);
+}
+int Log2Size(int TwoToPower,int MAXLOG2) // exact log2 of TwoToPower, searching exponents 0..MAXLOG2
 {
-  ////////////////////////////////////////////////////////////////
-  // Assert power of two shm_size.
-  ////////////////////////////////////////////////////////////////
   int log2size = -1;
-  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){  
-    if ( (0x1<<i) == WorldShmSize ) {
+  for(int i=0;i<=MAXLOG2;i++){
+    if ( (0x1<<i) == TwoToPower ) {
       log2size = i;
       break;
     }
   }
+  return log2size; // still -1 when no exponent in range matched
+}
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
+{
+#ifdef HYPERCUBE
+  ////////////////////////////////////////////////////////////////
+  // Assert power of two shm_size.
+  ////////////////////////////////////////////////////////////////
+  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
+  assert(log2size != -1);
+
+  ////////////////////////////////////////////////////////////////
+  // Identify the hypercube coordinate of this node using hostname
+  ////////////////////////////////////////////////////////////////
+  // n runs 0...7 9...16 18...25 27...34     (8*4)  5 bits
+  // i runs 0..7                                    3 bits
+  // r runs 0..3                                    2 bits
+  // 2^10 = 1024 nodes
+  const int maxhdim = 10; 
+  std::vector<int> HyperCubeCoords(maxhdim,0);
+  std::vector<int> RootHyperCubeCoords(maxhdim,0);
+  int R;
+  int I;
+  int N;
+  const int namelen = _POSIX_HOST_NAME_MAX;
+  char name[namelen];
+
+  // Parse ICE-XA hostname to get hypercube location
+  gethostname(name,namelen);
+  int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
+  assert(nscan==3);
+
+  int nlo = N%9;
+  int nhi = N/9;
+  uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
+  uint32_t rootcoor  = hypercoor;
+
+  //////////////////////////////////////////////////////////////////
+  // Print debug info
+  //////////////////////////////////////////////////////////////////
+  for(int d=0;d<maxhdim;d++){
+    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
+  }
+
+  std::string hname(name);
+  std::cout << "hostname "<<hname<<std::endl;
+  std::cout << "R " << R << " I " << I << " N "<< N
+            << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
+
+  //////////////////////////////////////////////////////////////////
+  // broadcast node 0's base coordinate for this partition.
+  //////////////////////////////////////////////////////////////////
+  MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); 
+  hypercoor=hypercoor-rootcoor;
+  assert(hypercoor<WorldSize);
+  assert(hypercoor>=0);
+
+  //////////////////////////////////////
+  // Printing
+  //////////////////////////////////////
+  for(int d=0;d<maxhdim;d++){
+    HyperCubeCoords[d] = (hypercoor>>d)&0x1;
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Identify subblock of ranks on node spreading across dims
+  // in a maximally symmetrical way
+  ////////////////////////////////////////////////////////////////
+  int ndimension              = processors.size();
+  std::vector<int> processor_coor(ndimension);
+  std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension);
+  std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension);
+  std::vector<int> HyperCoor(ndimension);
+  int dim = 0;
+  for(int l2=0;l2<log2size;l2++){
+    while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
+    ShmDims[dim]*=2;
+    dim=(dim+1)%ndimension;
+    }
+
+  ////////////////////////////////////////////////////////////////
+  // Establish torus of processes and nodes with sub-blockings
+  ////////////////////////////////////////////////////////////////
+  for(int d=0;d<ndimension;d++){
+    NodeDims[d] = WorldDims[d]/ShmDims[d];
+  }
+  ////////////////////////////////////////////////////////////////
+  // Map Hcube according to physical lattice 
+  // must partition. Loop over dims and find out who would join.
+  ////////////////////////////////////////////////////////////////
+  int hcoor = hypercoor;
+  for(int d=0;d<ndimension;d++){
+     int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
+     int msk  = (0x1<<bits)-1;
+     HyperCoor[d]=hcoor & msk;  
+     HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
+     hcoor = hcoor >> bits;
+  }
+  ////////////////////////////////////////////////////////////////
+  // Check processor counts match
+  ////////////////////////////////////////////////////////////////
+  int Nprocessors=1;
+  for(int i=0;i<ndimension;i++){
+    Nprocessors*=processors[i];
+  }
+  assert(WorldSize==Nprocessors);
+
+  ////////////////////////////////////////////////////////////////
+  // Establish mapping between lexico physics coord and WorldRank
+  ////////////////////////////////////////////////////////////////
+  int rank;
+
+  Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims);
+
+  for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
+
+  Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
+  for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
+  Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
+
+  /////////////////////////////////////////////////////////////////
+  // Build the new communicator
+  /////////////////////////////////////////////////////////////////
+  int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
+  assert(ierr==0);
+#else 
+  ////////////////////////////////////////////////////////////////
+  // Assert power of two shm_size.
+  ////////////////////////////////////////////////////////////////
+  int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
   assert(log2size != -1);
 
   ////////////////////////////////////////////////////////////////
@@ -181,7 +313,69 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
   /////////////////////////////////////////////////////////////////
   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
   assert(ierr==0);
+#endif
 }
+////////////////////////////////////////////////////////////////////////////////////////////
+// SHMGET
+////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_MPI3_SHMGET
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Allocate the shared windows for our group: rank 0 of WorldShmComm creates one System V segment of
+  // `bytes` per rank and broadcasts the ids. The `flags` argument is unused; Hugepages selects huge pages.
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+  std::vector<int> shmids(WorldShmSize);
+
+  if ( WorldShmRank == 0 ) {
+    for(int r=0;r<WorldShmSize;r++){
+      size_t size = bytes;
+      key_t key   = IPC_PRIVATE;
+      int shmflags = IPC_CREAT | SHM_R | SHM_W; // own name: must not shadow the `flags` parameter
+#ifdef SHM_HUGETLB
+      if (Hugepages) shmflags|=SHM_HUGETLB;
+#endif
+      if ((shmids[r]= shmget(key,size, shmflags)) ==-1) {
+        int errsv = errno;
+        printf("Errno %d\n",errsv);
+        printf("key   %d\n",(int)key);
+        printf("size  %zu\n",size); // size_t takes %zu; %lld was a format/argument mismatch
+        printf("flags %d\n",shmflags);
+        errno = errsv; perror("shmget"); // restore errno: the printf calls above may have changed it
+        exit(1);
+      }
+    }
+  }
+  MPI_Barrier(WorldShmComm);
+  MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
+  MPI_Barrier(WorldShmComm);
+
+  for(int r=0;r<WorldShmSize;r++){
+    WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
+    if (WorldShmCommBufs[r] == (uint64_t *)-1) {
+      perror("Shared memory attach failure");
+      shmctl(shmids[r], IPC_RMID, NULL);
+      exit(2);
+    }
+  }
+  MPI_Barrier(WorldShmComm);
+  ///////////////////////////////////
+  // Mark for clean up: every rank is attached by now (barrier above), so IPC_RMID only schedules removal
+  ///////////////////////////////////
+  for(int r=0;r<WorldShmSize;r++){
+    shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
+  }
+  MPI_Barrier(WorldShmComm);
+
+  _ShmAlloc=1;
+  _ShmAllocBytes  = bytes;
+}
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
@@ -272,6 +466,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
+  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
   assert(_ShmSetup==1);
   assert(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -281,7 +476,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   WorldShmCommBufs.resize(WorldShmSize);
   
   ////////////////////////////////////////////////////////////////////////////////////////////
-  // Hugetlbf and others map filesystems as mappable huge pages
+  // Hugetlbfs and others map filesystems as mappable huge pages
   ////////////////////////////////////////////////////////////////////////////////////////////
   char shm_name [NAME_MAX];
   for(int r=0;r<WorldShmSize;r++){
@@ -308,6 +503,49 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     assert(((uint64_t)ptr&0x3F)==0);
     close(fd);
     WorldShmCommBufs[r] =ptr;
+    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
+  }
+  _ShmAlloc=1;
+  _ShmAllocBytes  = bytes;
+};
+#endif // MMAP
+
+#ifdef GRID_MPI3_SHM_NONE
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
+  assert(_ShmSetup==1);
+  assert(_ShmAlloc==0);
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // allocate the shared windows for our group
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////
+  MPI_Barrier(WorldShmComm);
+  WorldShmCommBufs.resize(WorldShmSize);
+  
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Anonymous shared mapping: no backing file and no name, so other ranks have
+  // nothing to open; the code below therefore insists on one rank per node.
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  assert(WorldShmSize == 1);
+  for(int r=0;r<WorldShmSize;r++){
+    
+    int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
+#ifdef MAP_POPULATE
+    mmap_flag|=MAP_POPULATE;
+#endif
+#ifdef MAP_HUGETLB
+    if ( flags ) mmap_flag |= MAP_HUGETLB;
+#endif
+    // fd is -1, as portable MAP_ANONYMOUS use requires; there is no descriptor to close afterwards
+    void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,-1, 0); 
+    if ( ptr == (void *)MAP_FAILED ) {    
+      perror("failed mmap"); // report errno first, before another stdio call can change it
+      printf("anonymous mmap of %llu bytes failed\n",(unsigned long long)bytes); // was printing an uninitialised name
+      assert(0);
+    }
+    assert(((uint64_t)ptr&0x3F)==0);
+    WorldShmCommBufs[r] =ptr;
+    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
   }
   _ShmAlloc=1;
   _ShmAllocBytes  = bytes;
@@ -322,6 +560,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 { 
+  std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
   assert(_ShmSetup==1);
   assert(_ShmAlloc==0); 
   MPI_Barrier(WorldShmComm);
@@ -333,7 +572,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	
       size_t size = bytes;
       
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);
+      struct passwd *pw = getpwuid (getuid());
+      sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
       
       shm_unlink(shm_name);
       int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
@@ -349,7 +589,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
       
-      if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
+      //      std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
+      if ( ptr == (void * )MAP_FAILED ) {       
+	perror("failed mmap");     
+	assert(0);    
+      }
       assert(((uint64_t)ptr&0x3F)==0);
       
       WorldShmCommBufs[r] =ptr;
@@ -364,7 +608,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 
       size_t size = bytes ;
       
-      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);
+      struct passwd *pw = getpwuid (getuid());
+      sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
       
       int fd=shm_open(shm_name,O_RDWR,0666);
       if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
@@ -430,7 +675,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 
     uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
 
-    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,comm);
+    MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
 
     ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
   }
@@ -504,6 +749,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
     return (void *) remote;
   }
 }
+SharedMemory::~SharedMemory()
+{
+  // Release ShmComm only while MPI is live; after MPI_Finalize the free is skipped.
+  int mpi_done;
+  MPI_Finalized(&mpi_done);
+  if ( mpi_done == 0 ) MPI_Comm_free(&ShmComm);
+};
 
 NAMESPACE_END(Grid); 
 
diff --git a/lib/communicator/SharedMemoryNone.cc b/Grid/communicator/SharedMemoryNone.cc
similarity index 99%
rename from lib/communicator/SharedMemoryNone.cc
rename to Grid/communicator/SharedMemoryNone.cc
index ba705112..ed37ab47 100644
--- a/lib/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@@ -122,6 +122,8 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
   return NULL;
 }
+SharedMemory::~SharedMemory()
+{}; // empty destructor body: this build releases nothing here
 
 NAMESPACE_END(Grid); 
 
diff --git a/lib/cshift/Cshift.h b/Grid/cshift/Cshift.h
similarity index 100%
rename from lib/cshift/Cshift.h
rename to Grid/cshift/Cshift.h
diff --git a/lib/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
similarity index 83%
rename from lib/cshift/Cshift_common.h
rename to Grid/cshift/Cshift_common.h
index 8c429723..df606657 100644
--- a/lib/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -23,10 +23,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
     See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef _GRID_CSHIFT_COMMON_H_
-#define _GRID_CSHIFT_COMMON_H_
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -45,40 +44,44 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
   int so=plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane 
   int e1=rhs.Grid()->_slice_nblock[dimension];
   int e2=rhs.Grid()->_slice_block[dimension];
+  int ent = 0;
 
+  static Vector<std::pair<int,int> > table; table.resize(e1*e2);
   int stride=rhs.Grid()->_slice_stride[dimension];
+
   auto rhs_v = rhs.View();
   if ( cbmask == 0x3 ) { 
-    thread_loop_collapse2( (int n=0;n<e1;n++) , 
-      for(int b=0;b<e2;b++){
-	int o  = n*stride;
-	int bo = n*e2;
-	buffer[off+bo+b]=rhs_v[so+o+b];
-      }
-    );
-  } else { 
-    int bo=0;
-    std::vector<std::pair<int,int> > table;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o  = n*stride;
-	int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
-	if ( ocb &cbmask ) {
-	  table.push_back(std::pair<int,int> (bo++,o+b));
-	}
+	int bo = n*e2;
+	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
       }
     }
-    thread_loop( (int i=0;i<table.size();i++),{
-      buffer[off+table[i].first]=rhs_v[so+table[i].second];
-    });
+  } else { 
+     int bo=0;
+     for(int n=0;n<e1;n++){
+       for(int b=0;b<e2;b++){
+	 int o  = n*stride;
+	 int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
+	 if ( ocb &cbmask ) {
+	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
+	 }
+       }
+     }
   }
+  thread_loop( (int i=0;i<ent;i++),{
+    buffer[table[i].first]=rhs_v[table[i].second];
+  });
 }
 
 ///////////////////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split 
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void 
-Gather_plane_extract(const Lattice<vobj> &rhs,ExtractPointerArray<typename vobj::scalar_object> pointers,int dimension,int plane,int cbmask)
+Gather_plane_extract(const Lattice<vobj> &rhs,
+		     ExtractPointerArray<typename vobj::scalar_object> pointers,
+		     int dimension,int plane,int cbmask)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
 
@@ -102,7 +105,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,ExtractPointerArray<typename vobj:
 	
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
-
       }
     });
   } else { 
@@ -142,31 +144,37 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   int e1=rhs.Grid()->_slice_nblock[dimension];
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride=rhs.Grid()->_slice_stride[dimension];
-  auto rhs_v = rhs.View();
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent    =0;
+
   if ( cbmask ==0x3 ) {
-    thread_loop_collapse2( (int n=0;n<e1;n++),{
+
+    for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o   =n*rhs.Grid()->_slice_stride[dimension];
 	int bo  =n*rhs.Grid()->_slice_block[dimension];
-	rhs_v[so+o+b]=buffer[bo+b];
+	table[ent++] = std::pair<int,int>(so+o+b,bo+b);
       }
-    });
+    }
+
   } else { 
-    std::vector<std::pair<int,int> > table;
     int bo=0;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o   =n*rhs.Grid()->_slice_stride[dimension];
 	int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
-	  table.push_back(std::pair<int,int> (so+o+b,bo++));
+	  table[ent++]=std::pair<int,int> (so+o+b,bo++);
 	}
       }
     }
-    thread_loop( (int i=0;i<table.size();i++),{
-      rhs_v[table[i].first]=buffer[table[i].second];
-    });
   }
+  
+  auto rhs_v = rhs.View();
+  thread_loop( (int i=0;i<ent;i++), {
+    rhs_v[table[i].first]=buffer[table[i].second];
+  });
 }
 
 //////////////////////////////////////////////////////
@@ -185,8 +193,8 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   int e1=rhs.Grid()->_slice_nblock[dimension];
   int e2=rhs.Grid()->_slice_block[dimension];
 
-  auto rhs_v = rhs.View();
   if(cbmask ==0x3 ) {
+    auto rhs_v = rhs.View();
     thread_loop_collapse2( (int n=0;n<e1;n++),{
       for(int b=0;b<e2;b++){
 	int o      = n*rhs.Grid()->_slice_stride[dimension];
@@ -198,9 +206,9 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 
     // Case of SIMD split AND checker dim cannot currently be hit, except in 
     // Test_cshift_red_black code.
-    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;
-    // think this is buggy FIXME
+    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
+    auto rhs_v = rhs.View();
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o      = n*rhs.Grid()->_slice_stride[dimension];
@@ -225,40 +233,44 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
     cbmask=0x3;
   }
 
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
-
   int ro  = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane 
   int lo  = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane 
 
   int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent=0;
+
   if(cbmask == 0x3 ){
-    thread_loop_collapse2((int n=0;n<e1;n++),{
+    for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
         int o =n*stride+b;
-	vstream(lhs_v[lo+o],rhs_v[ro+o]);
+	table[ent++] = std::pair<int,int>(lo+o,ro+o);
       }
-    });
+    }
   } else { 
-    thread_loop_collapse2( (int n=0;n<e1;n++),{
+    for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
         int o =n*stride+b;
         int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
         if ( ocb&cbmask ) {
-	  vstream(lhs_v[lo+o],rhs_v[ro+o]);
+	  table[ent++] = std::pair<int,int>(lo+o,ro+o);
 	}
       }
-    });
+    }
   }
-  
+
+  auto rhs_v = rhs.View();
+  auto lhs_v = lhs.View();
+  thread_loop( (int i=0;i<ent;i++),{
+    lhs_v[table[i].first]=rhs_v[table[i].second];
+  });
+
 }
 
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 {
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
  
   int rd = rhs.Grid()->_rdimensions[dimension];
 
@@ -273,15 +285,29 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e2=rhs.Grid()->_slice_block [dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
 
-  thread_loop_collapse2( (int n=0;n<e1;n++),{
-    for(int b=0;b<e2;b++){
+  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
+  int ent=0;
 
+  double t_tab,t_perm;
+  if ( cbmask == 0x3 ) {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
+      int o  =n*stride;
+      table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  } else {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
       int o  =n*stride;
       int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) {
-	permute(lhs_v[lo+o+b],rhs_v[ro+o+b],permute_type);
-      }
-    }
+      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  }
+
+  auto rhs_v = rhs.View();
+  auto lhs_v = lhs.View();
+  thread_loop( (int i=0;i<ent;i++),{
+    permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
   });
 }
 
@@ -295,6 +321,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
   sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
   sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
 
+  double t_local;
+  
   if ( sshift[0] == sshift[1] ) {
     Cshift_local(ret,rhs,dimension,shift,0x3);
   } else {
@@ -323,17 +351,13 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
 
   for(int x=0;x<rd;x++){       
 
-    //    int o   = 0;
+    int o   = 0;
     int bo  = x * grid->_ostride[dimension];
     int cb= (cbmask==0x2)? Odd : Even;
 
     int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
     int sx     = (x+sshift)%rd;
-
-    // FIXME : This must change where we have a 
-    // Rotate slice.
     
-    // Document how this works ; why didn't I do this when I first wrote it...
     // wrap is whether sshift > rd.
     //  num is sshift mod rd.
     // 
@@ -369,11 +393,8 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
 
     if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
     else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-
   
   }
 }
-
 NAMESPACE_END(Grid);
 
-#endif
diff --git a/lib/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
similarity index 92%
rename from lib/cshift/Cshift_mpi.h
rename to Grid/cshift/Cshift_mpi.h
index 1ee2233c..0f0e80b1 100644
--- a/lib/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -54,13 +54,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
 
 
   if ( !comm_dim ) {
-    //    std::cout << "Cshift_local" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
   } else if ( splice_dim ) {
-    //    std::cout << "Cshift_comms_simd" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift);
   } else {
-    //    std::cout << "Cshift_comms" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
     Cshift_comms(ret,rhs,dimension,shift);
   }
   return ret;
@@ -91,9 +91,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
   sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
   sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
 
+  //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
   if ( sshift[0] == sshift[1] ) {
+    //std::cout << "Single pass Cshift_comms" <<std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
   } else {
+    //std::cout << "Two pass Cshift_comms" <<std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
     Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
   }
@@ -175,6 +178,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
   int simd_layout     = grid->_simd_layout[dimension];
   int comm_dim        = grid->_processors[dimension] >1 ;
 
+  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
+  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
+  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
+
   assert(comm_dim==1);
   assert(simd_layout==2);
   assert(shift>=0);
diff --git a/lib/cshift/Cshift_none.h b/Grid/cshift/Cshift_none.h
similarity index 100%
rename from lib/cshift/Cshift_none.h
rename to Grid/cshift/Cshift_none.h
diff --git a/lib/json/json.hpp b/Grid/json/json.hpp
similarity index 62%
rename from lib/json/json.hpp
rename to Grid/json/json.hpp
index 3f5c2b19..c8b0cc9e 100644
--- a/lib/json/json.hpp
+++ b/Grid/json/json.hpp
@@ -1,11 +1,12 @@
 /*
     __ _____ _____ _____
  __|  |   __|     |   | |  JSON for Modern C++
-|  |  |__   |  |  | | | |  version 2.1.1
+|  |  |__   |  |  | | | |  version 3.2.0
 |_____|_____|_____|_|___|  https://github.com/nlohmann/json
 
 Licensed under the MIT License <http://opensource.org/licenses/MIT>.
-Copyright (c) 2013-2017 Niels Lohmann <http://nlohmann.me>.
+SPDX-License-Identifier: MIT
+Copyright (c) 2013-2018 Niels Lohmann <http://nlohmann.me>.
 
 Permission is hereby  granted, free of charge, to any  person obtaining a copy
 of this software and associated  documentation files (the "Software"), to deal
@@ -29,42 +30,104 @@ SOFTWARE.
 #ifndef NLOHMANN_JSON_HPP
 #define NLOHMANN_JSON_HPP
 
-#include <algorithm> // all_of, copy, fill, find, for_each, generate_n, none_of, remove, reverse, transform
-#include <array> // array
+#define NLOHMANN_JSON_VERSION_MAJOR 3
+#define NLOHMANN_JSON_VERSION_MINOR 2
+#define NLOHMANN_JSON_VERSION_PATCH 0
+
+#include <algorithm> // all_of, find, for_each
 #include <cassert> // assert
 #include <ciso646> // and, not, or
-#include <clocale> // lconv, localeconv
-#include <cmath> // isfinite, labs, ldexp, signbit
 #include <cstddef> // nullptr_t, ptrdiff_t, size_t
-#include <cstdint> // int64_t, uint64_t
-#include <cstdlib> // abort, strtod, strtof, strtold, strtoul, strtoll, strtoull
-#include <cstring> // memcpy, strlen
-#include <forward_list> // forward_list
-#include <functional> // function, hash, less
+#include <functional> // hash, less
 #include <initializer_list> // initializer_list
-#include <iomanip> // hex
-#include <iosfwd>   // istream, ostream
-#include <iterator> // advance, begin, back_inserter, bidirectional_iterator_tag, distance, end, inserter, iterator, iterator_traits, next, random_access_iterator_tag, reverse_iterator
-#include <limits> // numeric_limits
-#include <locale> // locale
-#include <map> // map
-#include <memory> // addressof, allocator, allocator_traits, unique_ptr
+#include <iosfwd> // istream, ostream
+#include <iterator> // iterator_traits, random_access_iterator_tag
 #include <numeric> // accumulate
-#include <sstream> // stringstream
-#include <string> // getline, stoi, string, to_string
-#include <type_traits> // add_pointer, conditional, decay, enable_if, false_type, integral_constant, is_arithmetic, is_base_of, is_const, is_constructible, is_convertible, is_default_constructible, is_enum, is_floating_point, is_integral, is_nothrow_move_assignable, is_nothrow_move_constructible, is_pointer, is_reference, is_same, is_scalar, is_signed, remove_const, remove_cv, remove_pointer, remove_reference, true_type, underlying_type
-#include <utility> // declval, forward, make_pair, move, pair, swap
-#include <valarray> // valarray
+#include <string> // string, stoi, to_string
+#include <utility> // declval, forward, move, pair, swap
+
+// #include <nlohmann/json_fwd.hpp>
+#ifndef NLOHMANN_JSON_FWD_HPP
+#define NLOHMANN_JSON_FWD_HPP
+
+#include <cstdint> // int64_t, uint64_t
+#include <map> // map
+#include <memory> // allocator
+#include <string> // string
 #include <vector> // vector
 
+/*!
+@brief namespace for Niels Lohmann
+@see https://github.com/nlohmann
+@since version 1.0.0
+*/
+namespace nlohmann
+{
+/*!
+@brief default JSONSerializer template argument
+
+This serializer ignores the template arguments and uses ADL
+([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
+for serialization.
+*/
+template<typename T = void, typename SFINAE = void>
+struct adl_serializer;
+
+template<template<typename U, typename V, typename... Args> class ObjectType =
+         std::map,
+         template<typename U, typename... Args> class ArrayType = std::vector,
+         class StringType = std::string, class BooleanType = bool,
+         class NumberIntegerType = std::int64_t,
+         class NumberUnsignedType = std::uint64_t,
+         class NumberFloatType = double,
+         template<typename U> class AllocatorType = std::allocator,
+         template<typename T, typename SFINAE = void> class JSONSerializer =
+         adl_serializer>
+class basic_json;
+
+/*!
+@brief JSON Pointer
+
+A JSON pointer defines a string syntax for identifying a specific value
+within a JSON document. It can be used with functions `at` and
+`operator[]`. Furthermore, JSON pointers are the base for JSON patches.
+
+@sa [RFC 6901](https://tools.ietf.org/html/rfc6901)
+
+@since version 2.0.0
+*/
+template<typename BasicJsonType>
+class json_pointer;
+
+/*!
+@brief default JSON class
+
+This type is the default specialization of the @ref basic_json class which
+uses the standard template types.
+
+@since version 1.0.0
+*/
+using json = basic_json<>;
+}
+
+#endif
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+// This file contains all internal macro definitions
+// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them
+
 // exclude unsupported compilers
-#if defined(__clang__)
-    #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400
-        #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers"
-    #endif
-#elif defined(__GNUC__)
-    #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40805
-        #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers"
+#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK)
+    #if defined(__clang__)
+        #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400
+            #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers"
+        #endif
+    #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER))
+        #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800
+            #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers"
+        #endif
     #endif
 #endif
 
@@ -90,14 +153,36 @@ SOFTWARE.
 #endif
 
 // allow to disable exceptions
-#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && not defined(JSON_NOEXCEPTION)
+#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION)
     #define JSON_THROW(exception) throw exception
     #define JSON_TRY try
     #define JSON_CATCH(exception) catch(exception)
+    #define JSON_INTERNAL_CATCH(exception) catch(exception)
 #else
     #define JSON_THROW(exception) std::abort()
     #define JSON_TRY if(true)
     #define JSON_CATCH(exception) if(false)
+    #define JSON_INTERNAL_CATCH(exception) if(false)
+#endif
+
+// override exception macros
+#if defined(JSON_THROW_USER)
+    #undef JSON_THROW
+    #define JSON_THROW JSON_THROW_USER
+#endif
+#if defined(JSON_TRY_USER)
+    #undef JSON_TRY
+    #define JSON_TRY JSON_TRY_USER
+#endif
+#if defined(JSON_CATCH_USER)
+    #undef JSON_CATCH
+    #define JSON_CATCH JSON_CATCH_USER
+    #undef JSON_INTERNAL_CATCH
+    #define JSON_INTERNAL_CATCH JSON_CATCH_USER
+#endif
+#if defined(JSON_INTERNAL_CATCH_USER)
+    #undef JSON_INTERNAL_CATCH
+    #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER
 #endif
 
 // manual branch prediction
@@ -109,31 +194,16 @@ SOFTWARE.
     #define JSON_UNLIKELY(x)    x
 #endif
 
-/*!
-@brief namespace for Niels Lohmann
-@see https://github.com/nlohmann
-@since version 1.0.0
-*/
-namespace nlohmann
-{
-template<typename = void, typename = void>
-struct adl_serializer;
+// C++ language standard detection
+#if (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464
+    #define JSON_HAS_CPP_17
+    #define JSON_HAS_CPP_14
+#elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1)
+    #define JSON_HAS_CPP_14
+#endif
 
-// forward declaration of basic_json (required to split the class)
-template<template<typename U, typename V, typename... Args> class ObjectType =
-         std::map,
-         template<typename U, typename... Args> class ArrayType = std::vector,
-         class StringType = std::string, class BooleanType = bool,
-         class NumberIntegerType = std::int64_t,
-         class NumberUnsignedType = std::uint64_t,
-         class NumberFloatType = double,
-         template<typename U> class AllocatorType = std::allocator,
-         template<typename T, typename SFINAE = void> class JSONSerializer =
-         adl_serializer>
-class basic_json;
-
-// Ugly macros to avoid uglier copy-paste when specializing basic_json
-// This is only temporary and will be removed in 3.0
+// Ugly macros to avoid uglier copy-paste when specializing basic_json. They
+// may be removed in the future once the class is split.
 
 #define NLOHMANN_BASIC_JSON_TPL_DECLARATION                                \
     template<template<typename, typename, typename...> class ObjectType,   \
@@ -148,17 +218,409 @@ class basic_json;
     NumberIntegerType, NumberUnsignedType, NumberFloatType,                \
     AllocatorType, JSONSerializer>
 
+// #include <nlohmann/detail/meta/cpp_future.hpp>
 
+
+#include <ciso646> // not
+#include <cstddef> // size_t
+#include <type_traits> // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type
+
+namespace nlohmann
+{
+namespace detail
+{
+// alias templates to reduce boilerplate
+template<bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+
+template<typename T>
+using uncvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+
+// re-implementation of C++14 std::index_sequence and affiliates (a compile-time pack of std::size_t indices)
+// source: https://stackoverflow.com/a/32223343
+template<std::size_t... Ints>
+struct index_sequence
+{
+    using type = index_sequence;
+    using value_type = std::size_t;
+    static constexpr std::size_t size() noexcept  // number of indices held in the pack
+    {
+        return sizeof...(Ints);
+    }
+};
+
+template<class Sequence1, class Sequence2>
+struct merge_and_renumber;  // concatenates two index_sequences; only the specialization below is defined
+
+template<std::size_t... I1, std::size_t... I2>
+struct merge_and_renumber<index_sequence<I1...>, index_sequence<I2...>>
+        : index_sequence < I1..., (sizeof...(I1) + I2)... > {};  // second pack is shifted by the size of the first
+
+template<std::size_t N>
+struct make_index_sequence  // yields index_sequence<0, ..., N-1> by splitting N into two halves and merging them
+    : merge_and_renumber < typename make_index_sequence < N / 2 >::type,
+      typename make_index_sequence < N - N / 2 >::type > {};
+
+template<> struct make_index_sequence<0> : index_sequence<> {};  // recursion anchor: empty sequence
+template<> struct make_index_sequence<1> : index_sequence<0> {};  // recursion anchor: single index 0
+
+template<typename... Ts>
+using index_sequence_for = make_index_sequence<sizeof...(Ts)>;
+
+// dispatch utility: priority_tag<N> derives from priority_tag<N-1> down to priority_tag<0> (taken from ranges-v3)
+template<unsigned N> struct priority_tag : priority_tag < N - 1 > {};
+template<> struct priority_tag<0> {};  // end of the inheritance chain
+
+// holds one value-initialized constexpr T, reachable as static_const<T>::value (taken from ranges-v3)
+template<typename T>
+struct static_const
+{
+    static constexpr T value{};
+};
+
+template<typename T>
+constexpr T static_const<T>::value;  // out-of-class definition of the static data member declared above
+}
+}
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+
+#include <ciso646> // not
+#include <limits> // numeric_limits
+#include <type_traits> // false_type, is_constructible, is_integral, is_same, true_type
+#include <utility> // declval
+
+// #include <nlohmann/json_fwd.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/detected.hpp>
+
+
+#include <type_traits>
+
+// #include <nlohmann/detail/meta/void_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+template <typename ...Ts> struct make_void  // maps any pack of types to void
+{
+    using type = void;
+};
+template <typename ...Ts> using void_t = typename make_void<Ts...>::type;  // always void, provided every Ts is well-formed
+}
+}
+
+
+// detection idiom, see https://en.cppreference.com/w/cpp/experimental/is_detected
+namespace nlohmann
+{
+namespace detail
+{
+struct nonesuch  // placeholder type that can be neither created, destroyed, copied nor assigned
+{
+    nonesuch() = delete;
+    ~nonesuch() = delete;
+    nonesuch(nonesuch const&) = delete;
+    void operator=(nonesuch const&) = delete;
+};
+
+template <class Default,
+          class AlwaysVoid,
+          template <class...> class Op,
+          class... Args>
+struct detector  // primary template: used when the specialization below does not match
+{
+    using value_t = std::false_type;
+    using type = Default;
+};
+
+template <class Default, template <class...> class Op, class... Args>
+struct detector<Default, void_t<Op<Args...>>, Op, Args...>  // selected only when Op<Args...> is well-formed
+{
+    using value_t = std::true_type;
+    using type = Op<Args...>;
+};
+
+template <template <class...> class Op, class... Args>  // true_type iff Op<Args...> is well-formed
+using is_detected = typename detector<nonesuch, void, Op, Args...>::value_t;
+
+template <template <class...> class Op, class... Args>  // Op<Args...> if well-formed, otherwise nonesuch
+using detected_t = typename detector<nonesuch, void, Op, Args...>::type;
+
+template <class Default, template <class...> class Op, class... Args>  // detector with a caller-chosen fallback type
+using detected_or = detector<Default, void, Op, Args...>;
+
+template <class Default, template <class...> class Op, class... Args>  // Op<Args...> if well-formed, otherwise Default
+using detected_or_t = typename detected_or<Default, Op, Args...>::type;
+
+template <class Expected, template <class...> class Op, class... Args>  // detected_t<Op, Args...> is exactly Expected
+using is_detected_exact = std::is_same<Expected, detected_t<Op, Args...>>;
+
+template <class To, template <class...> class Op, class... Args>  // detected_t<Op, Args...> is convertible to To
+using is_detected_convertible =
+    std::is_convertible<detected_t<Op, Args...>, To>;
+}
+}
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
 /*!
-@brief unnamed namespace with internal helper functions
+@brief detail namespace with internal helper functions
 
-This namespace collects some functions that could not be defined inside the
-@ref basic_json class.
+This namespace collects functions that should not be exposed,
+implementations of some @ref basic_json methods, and meta-programming helpers.
 
 @since version 2.1.0
 */
 namespace detail
 {
+/////////////
+// helpers //
+/////////////
+
+template<typename> struct is_basic_json : std::false_type {};
+
+NLOHMANN_BASIC_JSON_TPL_DECLARATION
+struct is_basic_json<NLOHMANN_BASIC_JSON_TPL> : std::true_type {};
+
+//////////////////////////
+// aliases for detected //
+//////////////////////////
+
+template <typename T>
+using mapped_type_t = typename T::mapped_type;
+
+template <typename T>
+using key_type_t = typename T::key_type;
+
+template <typename T>
+using value_type_t = typename T::value_type;
+
+template <typename T>
+using difference_type_t = typename T::difference_type;
+
+template <typename T>
+using pointer_t = typename T::pointer;
+
+template <typename T>
+using reference_t = typename T::reference;
+
+template <typename T>
+using iterator_category_t = typename T::iterator_category;
+
+template <typename T>
+using iterator_t = typename T::iterator;
+
+template <typename T, typename... Args>
+using to_json_function = decltype(T::to_json(std::declval<Args>()...));
+
+template <typename T, typename... Args>
+using from_json_function = decltype(T::from_json(std::declval<Args>()...));
+
+template <typename T, typename U>
+using get_template_function = decltype(std::declval<T>().template get<U>());
+
+///////////////////
+// is_ functions //
+///////////////////
+
+template <typename T, typename = void>
+struct is_iterator_traits : std::false_type {};  // fallback: T is not a std::iterator_traits specialization
+
+template <typename T>
+struct is_iterator_traits<std::iterator_traits<T>>
+{
+  private:
+    using traits = std::iterator_traits<T>;
+
+  public:
+    static constexpr auto value =  // true iff all five member typedefs checked below are present
+        is_detected<value_type_t, traits>::value &&
+        is_detected<difference_type_t, traits>::value &&
+        is_detected<pointer_t, traits>::value &&
+        is_detected<iterator_category_t, traits>::value &&
+        is_detected<reference_t, traits>::value;
+};
+
+// is_complete_type<T>: true_type when sizeof(T) is well-formed, false_type otherwise (source: https://stackoverflow.com/a/37193089/4116453)
+
+template <typename T, typename = void>
+struct is_complete_type : std::false_type {};
+
+template <typename T>
+struct is_complete_type<T, decltype(void(sizeof(T)))> : std::true_type {};
+
+template <typename BasicJsonType, typename CompatibleObjectType,
+          typename = void>
+struct is_compatible_object_type_impl : std::false_type {};  // fallback: type lacks mapped_type or key_type
+
+template <typename BasicJsonType, typename CompatibleObjectType>
+struct is_compatible_object_type_impl <
+    BasicJsonType, CompatibleObjectType,
+    enable_if_t<is_detected<mapped_type_t, CompatibleObjectType>::value and
+    is_detected<key_type_t, CompatibleObjectType>::value >>
+{
+
+    using object_t = typename BasicJsonType::object_t;
+
+    // macOS's is_constructible does not play well with nonesuch, so the member types are named directly here
+    static constexpr bool value =
+        std::is_constructible<typename object_t::key_type,
+        typename CompatibleObjectType::key_type>::value and
+        std::is_constructible<typename object_t::mapped_type,
+        typename CompatibleObjectType::mapped_type>::value;
+};
+
+template <typename BasicJsonType, typename CompatibleObjectType>
+struct is_compatible_object_type
+    : is_compatible_object_type_impl<BasicJsonType, CompatibleObjectType> {};  // public entry point for the trait
+
+template <typename BasicJsonType, typename CompatibleStringType,
+          typename = void>
+struct is_compatible_string_type_impl : std::false_type {};  // fallback: value_type missing or not string_t's value_type
+
+template <typename BasicJsonType, typename CompatibleStringType>
+struct is_compatible_string_type_impl <
+    BasicJsonType, CompatibleStringType,
+    enable_if_t<is_detected_exact<typename BasicJsonType::string_t::value_type,
+    value_type_t, CompatibleStringType>::value >>
+{
+    static constexpr auto value =  // string_t must be constructible from the candidate type
+        std::is_constructible<typename BasicJsonType::string_t, CompatibleStringType>::value;
+};
+
+template <typename BasicJsonType, typename CompatibleStringType>
+struct is_compatible_string_type
+    : is_compatible_string_type_impl<BasicJsonType, CompatibleStringType> {};  // public entry point for the trait
+
+template <typename BasicJsonType, typename CompatibleArrayType, typename = void>
+struct is_compatible_array_type_impl : std::false_type {};  // fallback: type lacks value_type or iterator
+
+template <typename BasicJsonType, typename CompatibleArrayType>
+struct is_compatible_array_type_impl <
+    BasicJsonType, CompatibleArrayType,
+    enable_if_t<is_detected<value_type_t, CompatibleArrayType>::value and
+    is_detected<iterator_t, CompatibleArrayType>::value >>
+{
+    // This is needed because json_reverse_iterator has a ::iterator type...
+    // Therefore it is detected as a CompatibleArrayType.
+    // The real fix would be to have an Iterable concept.
+    static constexpr bool value = not is_iterator_traits<std::iterator_traits<CompatibleArrayType>>::value;
+};
+
+template <typename BasicJsonType, typename CompatibleArrayType>
+struct is_compatible_array_type
+    : is_compatible_array_type_impl<BasicJsonType, CompatibleArrayType> {};  // public entry point for the trait
+
+template <typename RealIntegerType, typename CompatibleNumberIntegerType,
+          typename = void>
+struct is_compatible_integer_type_impl : std::false_type {};  // fallback: a type is not integral, or the candidate is bool
+
+template <typename RealIntegerType, typename CompatibleNumberIntegerType>
+struct is_compatible_integer_type_impl <
+    RealIntegerType, CompatibleNumberIntegerType,
+    enable_if_t<std::is_integral<RealIntegerType>::value and
+    std::is_integral<CompatibleNumberIntegerType>::value and
+    not std::is_same<bool, CompatibleNumberIntegerType>::value >>
+{
+    // is there an assert somewhere on overflows?
+    using RealLimits = std::numeric_limits<RealIntegerType>;
+    using CompatibleLimits = std::numeric_limits<CompatibleNumberIntegerType>;
+
+    static constexpr auto value =
+        std::is_constructible<RealIntegerType,
+        CompatibleNumberIntegerType>::value and
+        CompatibleLimits::is_integer and
+        RealLimits::is_signed == CompatibleLimits::is_signed;  // signedness of both types must match
+};
+
+template <typename RealIntegerType, typename CompatibleNumberIntegerType>
+struct is_compatible_integer_type
+    : is_compatible_integer_type_impl<RealIntegerType,
+      CompatibleNumberIntegerType> {};  // public entry point for the trait
+
+// trait checking if JSONSerializer<T>::from_json(json const&, udt&) exists and returns void (always false for basic_json types)
+template <typename BasicJsonType, typename T, typename = void>
+struct has_from_json : std::false_type {};
+
+template <typename BasicJsonType, typename T>
+struct has_from_json<BasicJsonType, T,
+           enable_if_t<not is_basic_json<T>::value>>
+{
+    using serializer = typename BasicJsonType::template json_serializer<T, void>;
+
+    static constexpr bool value =
+        is_detected_exact<void, from_json_function, serializer,
+        const BasicJsonType&, T&>::value;
+};
+
+// This trait checks if JSONSerializer<T>::from_json(json const&) exists and returns T;
+// this overload is used for non-default-constructible user-defined-types
+template <typename BasicJsonType, typename T, typename = void>
+struct has_non_default_from_json : std::false_type {};
+
+template<typename BasicJsonType, typename T>
+struct has_non_default_from_json<BasicJsonType, T, enable_if_t<not is_basic_json<T>::value>>
+{
+    using serializer = typename BasicJsonType::template json_serializer<T, void>;
+
+    static constexpr bool value =
+        is_detected_exact<T, from_json_function, serializer,
+        const BasicJsonType&>::value;
+};
+
+// This trait checks if BasicJsonType::json_serializer<T>::to_json(BasicJsonType&, T) exists and returns void.
+// Do not evaluate the trait when T is a basic_json type, to avoid template instantiation infinite recursion.
+template <typename BasicJsonType, typename T, typename = void>
+struct has_to_json : std::false_type {};
+
+template <typename BasicJsonType, typename T>
+struct has_to_json<BasicJsonType, T, enable_if_t<not is_basic_json<T>::value>>
+{
+    using serializer = typename BasicJsonType::template json_serializer<T, void>;
+
+    static constexpr bool value =
+        is_detected_exact<void, to_json_function, serializer, BasicJsonType&,
+        T>::value;
+};
+
+template <typename BasicJsonType, typename CompatibleType, typename = void>
+struct is_compatible_type_impl: std::false_type {};  // fallback: CompatibleType is not a complete type
+
+template <typename BasicJsonType, typename CompatibleType>
+struct is_compatible_type_impl <
+    BasicJsonType, CompatibleType,
+    enable_if_t<is_complete_type<CompatibleType>::value >>
+{
+    static constexpr bool value =  // compatible iff the serializer provides a matching to_json
+        has_to_json<BasicJsonType, CompatibleType>::value;
+};
+
+template <typename BasicJsonType, typename CompatibleType>
+struct is_compatible_type
+    : is_compatible_type_impl<BasicJsonType, CompatibleType> {};  // public entry point for the trait
+}
+}
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+
+#include <exception> // exception
+#include <stdexcept> // runtime_error
+#include <string> // to_string
+
+namespace nlohmann
+{
+namespace detail
+{
 ////////////////
 // exceptions //
 ////////////////
@@ -206,9 +668,9 @@ class exception : public std::exception
   protected:
     exception(int id_, const char* what_arg) : id(id_), m(what_arg) {}
 
-    static std::string name(const std::string& ename, int id)
+    static std::string name(const std::string& ename, int id_)
     {
-        return "[json.exception." + ename + "." + std::to_string(id) + "] ";
+        return "[json.exception." + ename + "." + std::to_string(id_) + "] ";
     }
 
   private:
@@ -219,7 +681,7 @@ class exception : public std::exception
 /*!
 @brief exception indicating a parse error
 
-This excpetion is thrown by the library when a parse error occurs. Parse errors
+This exception is thrown by the library when a parse error occurs. Parse errors
 can occur during the deserialization of JSON text, CBOR, MessagePack, as well
 as when using JSON Patch.
 
@@ -235,12 +697,12 @@ json.exception.parse_error.102 | parse error at 14: missing or wrong low surroga
 json.exception.parse_error.103 | parse error: code points above 0x10FFFF are invalid | Unicode supports code points up to 0x10FFFF. Code points above 0x10FFFF are invalid.
 json.exception.parse_error.104 | parse error: JSON patch must be an array of objects | [RFC 6902](https://tools.ietf.org/html/rfc6902) requires a JSON Patch document to be a JSON document that represents an array of objects.
 json.exception.parse_error.105 | parse error: operation must have string member 'op' | An operation of a JSON Patch document must contain exactly one "op" member, whose value indicates the operation to perform. Its value must be one of "add", "remove", "replace", "move", "copy", or "test"; other values are errors.
-json.exception.parse_error.106 | parse error: array index '01' must not begin with '0' | An array index in a JSON Pointer ([RFC 6901](https://tools.ietf.org/html/rfc6901)) may be `0` or any number wihtout a leading `0`.
+json.exception.parse_error.106 | parse error: array index '01' must not begin with '0' | An array index in a JSON Pointer ([RFC 6901](https://tools.ietf.org/html/rfc6901)) may be `0` or any number without a leading `0`.
 json.exception.parse_error.107 | parse error: JSON pointer must be empty or begin with '/' - was: 'foo' | A JSON Pointer must be a Unicode string containing a sequence of zero or more reference tokens, each prefixed by a `/` character.
 json.exception.parse_error.108 | parse error: escape character '~' must be followed with '0' or '1' | In a JSON Pointer, only `~0` and `~1` are valid escape sequences.
 json.exception.parse_error.109 | parse error: array index 'one' is not a number | A JSON Pointer array index must be a number.
 json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vector | When parsing CBOR or MessagePack, the byte vector ends before the complete value has been read.
-json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xf8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read.
+json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read.
 json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read.
 
 @note For an input with n bytes, 1 is the index of the first character and n+1
@@ -264,18 +726,18 @@ class parse_error : public exception
   public:
     /*!
     @brief create a parse error exception
-    @param[in] id        the id of the exception
+    @param[in] id_       the id of the exception
     @param[in] byte_     the byte index where the error occurred (or 0 if the
                          position cannot be determined)
     @param[in] what_arg  the explanatory string
     @return parse_error object
     */
-    static parse_error create(int id, std::size_t byte_, const std::string& what_arg)
+    static parse_error create(int id_, std::size_t byte_, const std::string& what_arg)
     {
-        std::string w = exception::name("parse_error", id) + "parse error" +
+        std::string w = exception::name("parse_error", id_) + "parse error" +
                         (byte_ != 0 ? (" at " + std::to_string(byte_)) : "") +
                         ": " + what_arg;
-        return parse_error(id, byte_, w.c_str());
+        return parse_error(id_, byte_, w.c_str());
     }
 
     /*!
@@ -334,10 +796,10 @@ caught.,invalid_iterator}
 class invalid_iterator : public exception
 {
   public:
-    static invalid_iterator create(int id, const std::string& what_arg)
+    static invalid_iterator create(int id_, const std::string& what_arg)
     {
-        std::string w = exception::name("invalid_iterator", id) + what_arg;
-        return invalid_iterator(id, w.c_str());
+        std::string w = exception::name("invalid_iterator", id_) + what_arg;
+        return invalid_iterator(id_, w.c_str());
     }
 
   private:
@@ -370,6 +832,7 @@ json.exception.type_error.312 | cannot use update() with string | The @ref updat
 json.exception.type_error.313 | invalid value to unflatten | The @ref unflatten function converts an object whose keys are JSON Pointers back into an arbitrary nested JSON value. The JSON Pointers must not overlap, because then the resulting value would not be well defined.
 json.exception.type_error.314 | only objects can be unflattened | The @ref unflatten function only works for an object whose keys are JSON Pointers.
 json.exception.type_error.315 | values in object must be primitive | The @ref unflatten function only works for an object whose keys are JSON Pointers and whose values are primitive.
+json.exception.type_error.316 | invalid UTF-8 byte at index 10: 0x7E | The @ref dump function only works with UTF-8 encoded strings; that is, if you assign a `std::string` to a JSON value, make sure it is UTF-8 encoded. |
 
 @liveexample{The following code shows how a `type_error` exception can be
 caught.,type_error}
@@ -385,10 +848,10 @@ caught.,type_error}
 class type_error : public exception
 {
   public:
-    static type_error create(int id, const std::string& what_arg)
+    static type_error create(int id_, const std::string& what_arg)
     {
-        std::string w = exception::name("type_error", id) + what_arg;
-        return type_error(id, w.c_str());
+        std::string w = exception::name("type_error", id_) + what_arg;
+        return type_error(id_, w.c_str());
     }
 
   private:
@@ -412,6 +875,8 @@ json.exception.out_of_range.403 | key 'foo' not found | The provided key was not
 json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved.
 json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value.
 json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored as without changing it to NaN or INF.
+json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON only supports integer numbers up to 9223372036854775807. |
+json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of a UBJSON array or object exceeds the maximal capacity. |
 
 @liveexample{The following code shows how an `out_of_range` exception can be
 caught.,out_of_range}
@@ -428,10 +893,10 @@ caught.,out_of_range}
 class out_of_range : public exception
 {
   public:
-    static out_of_range create(int id, const std::string& what_arg)
+    static out_of_range create(int id_, const std::string& what_arg)
     {
-        std::string w = exception::name("out_of_range", id) + what_arg;
-        return out_of_range(id, w.c_str());
+        std::string w = exception::name("out_of_range", id_) + what_arg;
+        return out_of_range(id_, w.c_str());
     }
 
   private:
@@ -449,7 +914,6 @@ Exceptions have ids 5xx.
 name / id                      | example message | description
 ------------------------------ | --------------- | -------------------------
 json.exception.other_error.501 | unsuccessful: {"op":"test","path":"/baz", "value":"bar"} | A JSON Patch operation 'test' failed. The unsuccessful operation is also printed.
-json.exception.other_error.502 | invalid object size for conversion | Some conversions to user-defined types impose constraints on the object size (e.g. std::pair)
 
 @sa @ref exception for the base class of the library exceptions
 @sa @ref parse_error for exceptions indicating a parse error
@@ -466,18 +930,30 @@ caught.,other_error}
 class other_error : public exception
 {
   public:
-    static other_error create(int id, const std::string& what_arg)
+    static other_error create(int id_, const std::string& what_arg)
     {
-        std::string w = exception::name("other_error", id) + what_arg;
-        return other_error(id, w.c_str());
+        std::string w = exception::name("other_error", id_) + what_arg;
+        return other_error(id_, w.c_str());
     }
 
   private:
     other_error(int id_, const char* what_arg) : exception(id_, what_arg) {}
 };
+}
+}
+
+// #include <nlohmann/detail/value_t.hpp>
 
 
+#include <array> // array
+#include <ciso646> // and
+#include <cstddef> // size_t
+#include <cstdint> // uint8_t
 
+namespace nlohmann
+{
+namespace detail
+{
 ///////////////////////////
 // JSON type enumeration //
 ///////////////////////////
@@ -506,7 +982,7 @@ value with the default value for a given type
 
 @since version 1.0.0
 */
-enum class value_t : uint8_t
+enum class value_t : std::uint8_t
 {
     null,             ///< null value
     object,           ///< object (unordered set of name/value pairs)
@@ -525,562 +1001,70 @@ enum class value_t : uint8_t
 Returns an ordering that is similar to Python:
 - order: null < boolean < number < object < array < string
 - furthermore, each type is not smaller than itself
+- discarded values are not comparable
 
 @since version 1.0.0
 */
 inline bool operator<(const value_t lhs, const value_t rhs) noexcept
 {
-    static constexpr std::array<uint8_t, 8> order = {{
-            0, // null
-            3, // object
-            4, // array
-            5, // string
-            1, // boolean
-            2, // integer
-            2, // unsigned
-            2, // float
+    static constexpr std::array<std::uint8_t, 8> order = {{
+            0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */,
+            1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */
         }
     };
 
-    // discarded values are not comparable
-    return lhs != value_t::discarded and rhs != value_t::discarded and
-           order[static_cast<std::size_t>(lhs)] < order[static_cast<std::size_t>(rhs)];
+    const auto l_index = static_cast<std::size_t>(lhs);
+    const auto r_index = static_cast<std::size_t>(rhs);
+    return l_index < order.size() and r_index < order.size() and order[l_index] < order[r_index];
+}
+}
 }
 
+// #include <nlohmann/detail/conversions/from_json.hpp>
 
-/////////////
-// helpers //
-/////////////
 
-template<typename> struct is_basic_json : std::false_type {};
+#include <algorithm> // transform
+#include <array> // array
+#include <ciso646> // and, not
+#include <forward_list> // forward_list
+#include <iterator> // inserter, front_inserter, end
+#include <map> // map
+#include <string> // string
+#include <tuple> // tuple, make_tuple
+#include <type_traits> // is_arithmetic, is_same, is_enum, underlying_type, is_convertible
+#include <unordered_map> // unordered_map
+#include <utility> // pair, declval
+#include <valarray> // valarray
 
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-struct is_basic_json<NLOHMANN_BASIC_JSON_TPL> : std::true_type {};
+// #include <nlohmann/detail/exceptions.hpp>
 
-// alias templates to reduce boilerplate
-template<bool B, typename T = void>
-using enable_if_t = typename std::enable_if<B, T>::type;
+// #include <nlohmann/detail/macro_scope.hpp>
 
-template<typename T>
-using uncvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+// #include <nlohmann/detail/meta/cpp_future.hpp>
 
-// implementation of C++14 index_sequence and affiliates
-// source: https://stackoverflow.com/a/32223343
-template<std::size_t... Ints>
-struct index_sequence
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
 {
-    using type = index_sequence;
-    using value_type = std::size_t;
-    static constexpr std::size_t size() noexcept
-    {
-        return sizeof...(Ints);
-    }
-};
-
-template<class Sequence1, class Sequence2>
-struct merge_and_renumber;
-
-template<std::size_t... I1, std::size_t... I2>
-struct merge_and_renumber<index_sequence<I1...>, index_sequence<I2...>>
-        : index_sequence < I1..., (sizeof...(I1) + I2)... >
-          {};
-
-template<std::size_t N>
-struct make_index_sequence
-    : merge_and_renumber < typename make_index_sequence < N / 2 >::type,
-      typename make_index_sequence < N - N / 2 >::type >
-{};
-
-template<> struct make_index_sequence<0> : index_sequence<> { };
-template<> struct make_index_sequence<1> : index_sequence<0> { };
-
-template<typename... Ts>
-using index_sequence_for = make_index_sequence<sizeof...(Ts)>;
-
-/*
-Implementation of two C++17 constructs: conjunction, negation. This is needed
-to avoid evaluating all the traits in a condition
-
-For example: not std::is_same<void, T>::value and has_value_type<T>::value
-will not compile when T = void (on MSVC at least). Whereas
-conjunction<negation<std::is_same<void, T>>, has_value_type<T>>::value will
-stop evaluating if negation<...>::value == false
-
-Please note that those constructs must be used with caution, since symbols can
-become very long quickly (which can slow down compilation and cause MSVC
-internal compiler errors). Only use it when you have to (see example ahead).
-*/
-template<class...> struct conjunction : std::true_type {};
-template<class B1> struct conjunction<B1> : B1 {};
-template<class B1, class... Bn>
-struct conjunction<B1, Bn...> : std::conditional<bool(B1::value), conjunction<Bn...>, B1>::type {};
-
-template<class B> struct negation : std::integral_constant < bool, !B::value > {};
-
-// dispatch utility (taken from ranges-v3)
-template<unsigned N> struct priority_tag : priority_tag < N - 1 > {};
-template<> struct priority_tag<0> {};
-
-
-//////////////////
-// constructors //
-//////////////////
-
-template<value_t> struct external_constructor;
-
-template<>
-struct external_constructor<value_t::boolean>
+namespace detail
 {
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept
-    {
-        j.m_type = value_t::boolean;
-        j.m_value = b;
-        j.assert_invariant();
-    }
-};
-
-template<>
-struct external_constructor<value_t::string>
-{
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s)
-    {
-        j.m_type = value_t::string;
-        j.m_value = s;
-        j.assert_invariant();
-    }
-
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s)
-    {
-        j.m_type = value_t::string;
-        j.m_value = std::move(s);
-        j.assert_invariant();
-    }
-};
-
-template<>
-struct external_constructor<value_t::number_float>
-{
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept
-    {
-        j.m_type = value_t::number_float;
-        j.m_value = val;
-        j.assert_invariant();
-    }
-};
-
-template<>
-struct external_constructor<value_t::number_unsigned>
-{
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept
-    {
-        j.m_type = value_t::number_unsigned;
-        j.m_value = val;
-        j.assert_invariant();
-    }
-};
-
-template<>
-struct external_constructor<value_t::number_integer>
-{
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept
-    {
-        j.m_type = value_t::number_integer;
-        j.m_value = val;
-        j.assert_invariant();
-    }
-};
-
-template<>
-struct external_constructor<value_t::array>
-{
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr)
-    {
-        j.m_type = value_t::array;
-        j.m_value = arr;
-        j.assert_invariant();
-    }
-
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
-    {
-        j.m_type = value_t::array;
-        j.m_value = std::move(arr);
-        j.assert_invariant();
-    }
-
-    template<typename BasicJsonType, typename CompatibleArrayType,
-             enable_if_t<not std::is_same<CompatibleArrayType,
-                                          typename BasicJsonType::array_t>::value,
-                         int> = 0>
-    static void construct(BasicJsonType& j, const CompatibleArrayType& arr)
-    {
-        using std::begin;
-        using std::end;
-        j.m_type = value_t::array;
-        j.m_value.array = j.template create<typename BasicJsonType::array_t>(begin(arr), end(arr));
-        j.assert_invariant();
-    }
-
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, const std::vector<bool>& arr)
-    {
-        j.m_type = value_t::array;
-        j.m_value = value_t::array;
-        j.m_value.array->reserve(arr.size());
-        for (bool x : arr)
-        {
-            j.m_value.array->push_back(x);
-        }
-        j.assert_invariant();
-    }
-
-    template<typename BasicJsonType, typename T,
-             enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
-    static void construct(BasicJsonType& j, const std::valarray<T>& arr)
-    {
-        using std::begin;
-        using std::end;
-        j.m_type = value_t::array;
-        j.m_value = value_t::array;
-        j.m_value.array = j.template create<typename BasicJsonType::array_t>(begin(arr), end(arr));
-        j.assert_invariant();
-    }
-};
-
-template<>
-struct external_constructor<value_t::object>
-{
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj)
-    {
-        j.m_type = value_t::object;
-        j.m_value = obj;
-        j.assert_invariant();
-    }
-
-    template<typename BasicJsonType>
-    static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
-    {
-        j.m_type = value_t::object;
-        j.m_value = std::move(obj);
-        j.assert_invariant();
-    }
-
-    template<typename BasicJsonType, typename CompatibleObjectType,
-             enable_if_t<not std::is_same<CompatibleObjectType,
-                                          typename BasicJsonType::object_t>::value, int> = 0>
-    static void construct(BasicJsonType& j, const CompatibleObjectType& obj)
-    {
-        using std::begin;
-        using std::end;
-
-        j.m_type = value_t::object;
-        j.m_value.object = j.template create<typename BasicJsonType::object_t>(begin(obj), end(obj));
-        j.assert_invariant();
-    }
-};
-
-
-////////////////////////
-// has_/is_ functions //
-////////////////////////
-
-/*!
-@brief Helper to determine whether there's a key_type for T.
-
-This helper is used to tell associative containers apart from other containers
-such as sequence containers. For instance, `std::map` passes the test as it
-contains a `mapped_type`, whereas `std::vector` fails the test.
-
-@sa http://stackoverflow.com/a/7728728/266378
-@since version 1.0.0, overworked in version 2.0.6
-*/
-#define NLOHMANN_JSON_HAS_HELPER(type)                                        \
-    template<typename T> struct has_##type {                                  \
-    private:                                                                  \
-        template<typename U, typename = typename U::type>                     \
-        static int detect(U &&);                                              \
-        static void detect(...);                                              \
-    public:                                                                   \
-        static constexpr bool value =                                         \
-                std::is_integral<decltype(detect(std::declval<T>()))>::value; \
-    }
-
-NLOHMANN_JSON_HAS_HELPER(mapped_type);
-NLOHMANN_JSON_HAS_HELPER(key_type);
-NLOHMANN_JSON_HAS_HELPER(value_type);
-NLOHMANN_JSON_HAS_HELPER(iterator);
-
-#undef NLOHMANN_JSON_HAS_HELPER
-
-
-template<bool B, class RealType, class CompatibleObjectType>
-struct is_compatible_object_type_impl : std::false_type {};
-
-template<class RealType, class CompatibleObjectType>
-struct is_compatible_object_type_impl<true, RealType, CompatibleObjectType>
-{
-    static constexpr auto value =
-        std::is_constructible<typename RealType::key_type, typename CompatibleObjectType::key_type>::value and
-        std::is_constructible<typename RealType::mapped_type, typename CompatibleObjectType::mapped_type>::value;
-};
-
-template<class BasicJsonType, class CompatibleObjectType>
-struct is_compatible_object_type
-{
-    static auto constexpr value = is_compatible_object_type_impl <
-                                  conjunction<negation<std::is_same<void, CompatibleObjectType>>,
-                                  has_mapped_type<CompatibleObjectType>,
-                                  has_key_type<CompatibleObjectType>>::value,
-                                  typename BasicJsonType::object_t, CompatibleObjectType >::value;
-};
-
-template<typename BasicJsonType, typename T>
-struct is_basic_json_nested_type
-{
-    static auto constexpr value = std::is_same<T, typename BasicJsonType::iterator>::value or
-                                  std::is_same<T, typename BasicJsonType::const_iterator>::value or
-                                  std::is_same<T, typename BasicJsonType::reverse_iterator>::value or
-                                  std::is_same<T, typename BasicJsonType::const_reverse_iterator>::value;
-};
-
-template<class BasicJsonType, class CompatibleArrayType>
-struct is_compatible_array_type
-{
-    static auto constexpr value =
-        conjunction<negation<std::is_same<void, CompatibleArrayType>>,
-        negation<is_compatible_object_type<
-        BasicJsonType, CompatibleArrayType>>,
-        negation<std::is_constructible<typename BasicJsonType::string_t,
-        CompatibleArrayType>>,
-        negation<is_basic_json_nested_type<BasicJsonType, CompatibleArrayType>>,
-        has_value_type<CompatibleArrayType>,
-        has_iterator<CompatibleArrayType>>::value;
-};
-
-template<bool, typename, typename>
-struct is_compatible_integer_type_impl : std::false_type {};
-
-template<typename RealIntegerType, typename CompatibleNumberIntegerType>
-struct is_compatible_integer_type_impl<true, RealIntegerType, CompatibleNumberIntegerType>
-{
-    // is there an assert somewhere on overflows?
-    using RealLimits = std::numeric_limits<RealIntegerType>;
-    using CompatibleLimits = std::numeric_limits<CompatibleNumberIntegerType>;
-
-    static constexpr auto value =
-        std::is_constructible<RealIntegerType, CompatibleNumberIntegerType>::value and
-        CompatibleLimits::is_integer and
-        RealLimits::is_signed == CompatibleLimits::is_signed;
-};
-
-template<typename RealIntegerType, typename CompatibleNumberIntegerType>
-struct is_compatible_integer_type
-{
-    static constexpr auto value =
-        is_compatible_integer_type_impl <
-        std::is_integral<CompatibleNumberIntegerType>::value and
-        not std::is_same<bool, CompatibleNumberIntegerType>::value,
-        RealIntegerType, CompatibleNumberIntegerType > ::value;
-};
-
-
-// trait checking if JSONSerializer<T>::from_json(json const&, udt&) exists
-template<typename BasicJsonType, typename T>
-struct has_from_json
-{
-  private:
-    // also check the return type of from_json
-    template<typename U, typename = enable_if_t<std::is_same<void, decltype(uncvref_t<U>::from_json(
-                 std::declval<BasicJsonType>(), std::declval<T&>()))>::value>>
-    static int detect(U&&);
-    static void detect(...);
-
-  public:
-    static constexpr bool value = std::is_integral<decltype(
-                                      detect(std::declval<typename BasicJsonType::template json_serializer<T, void>>()))>::value;
-};
-
-// This trait checks if JSONSerializer<T>::from_json(json const&) exists
-// this overload is used for non-default-constructible user-defined-types
-template<typename BasicJsonType, typename T>
-struct has_non_default_from_json
-{
-  private:
-    template <
-        typename U,
-        typename = enable_if_t<std::is_same<
-                                   T, decltype(uncvref_t<U>::from_json(std::declval<BasicJsonType>()))>::value >>
-    static int detect(U&&);
-    static void detect(...);
-
-  public:
-    static constexpr bool value = std::is_integral<decltype(detect(
-                                      std::declval<typename BasicJsonType::template json_serializer<T, void>>()))>::value;
-};
-
-// This trait checks if BasicJsonType::json_serializer<T>::to_json exists
-template<typename BasicJsonType, typename T>
-struct has_to_json
-{
-  private:
-    template<typename U, typename = decltype(uncvref_t<U>::to_json(
-                 std::declval<BasicJsonType&>(), std::declval<T>()))>
-    static int detect(U&&);
-    static void detect(...);
-
-  public:
-    static constexpr bool value = std::is_integral<decltype(detect(
-                                      std::declval<typename BasicJsonType::template json_serializer<T, void>>()))>::value;
-};
-
-
-/////////////
-// to_json //
-/////////////
-
-template<typename BasicJsonType, typename T, enable_if_t<
-             std::is_same<T, typename BasicJsonType::boolean_t>::value, int> = 0>
-void to_json(BasicJsonType& j, T b) noexcept
-{
-    external_constructor<value_t::boolean>::construct(j, b);
-}
-
-template<typename BasicJsonType, typename CompatibleString,
-         enable_if_t<std::is_constructible<typename BasicJsonType::string_t,
-                     CompatibleString>::value, int> = 0>
-void to_json(BasicJsonType& j, const CompatibleString& s)
-{
-    external_constructor<value_t::string>::construct(j, s);
-}
-
-template <typename BasicJsonType>
-void to_json(BasicJsonType& j, typename BasicJsonType::string_t&& s)
-{
-    external_constructor<value_t::string>::construct(j, std::move(s));
-}
-
-template<typename BasicJsonType, typename FloatType,
-         enable_if_t<std::is_floating_point<FloatType>::value, int> = 0>
-void to_json(BasicJsonType& j, FloatType val) noexcept
-{
-    external_constructor<value_t::number_float>::construct(j, static_cast<typename BasicJsonType::number_float_t>(val));
-}
-
-template <
-    typename BasicJsonType, typename CompatibleNumberUnsignedType,
-    enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_unsigned_t,
-                CompatibleNumberUnsignedType>::value, int> = 0 >
-void to_json(BasicJsonType& j, CompatibleNumberUnsignedType val) noexcept
-{
-    external_constructor<value_t::number_unsigned>::construct(j, static_cast<typename BasicJsonType::number_unsigned_t>(val));
-}
-
-template <
-    typename BasicJsonType, typename CompatibleNumberIntegerType,
-    enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_integer_t,
-                CompatibleNumberIntegerType>::value, int> = 0 >
-void to_json(BasicJsonType& j, CompatibleNumberIntegerType val) noexcept
-{
-    external_constructor<value_t::number_integer>::construct(j, static_cast<typename BasicJsonType::number_integer_t>(val));
-}
-
-template<typename BasicJsonType, typename EnumType,
-         enable_if_t<std::is_enum<EnumType>::value, int> = 0>
-void to_json(BasicJsonType& j, EnumType e) noexcept
-{
-    using underlying_type = typename std::underlying_type<EnumType>::type;
-    external_constructor<value_t::number_integer>::construct(j, static_cast<underlying_type>(e));
-}
-
 template<typename BasicJsonType>
-void to_json(BasicJsonType& j, const std::vector<bool>& e)
+void from_json(const BasicJsonType& j, typename std::nullptr_t& n)
 {
-    external_constructor<value_t::array>::construct(j, e);
+    if (JSON_UNLIKELY(not j.is_null()))
+    {
+        JSON_THROW(type_error::create(302, "type must be null, but is " + std::string(j.type_name())));
+    }
+    n = nullptr;
 }
 
-template <
-    typename BasicJsonType, typename CompatibleArrayType,
-    enable_if_t <
-        is_compatible_array_type<BasicJsonType, CompatibleArrayType>::value or
-        std::is_same<typename BasicJsonType::array_t, CompatibleArrayType>::value,
-        int > = 0 >
-void to_json(BasicJsonType& j, const CompatibleArrayType& arr)
-{
-    external_constructor<value_t::array>::construct(j, arr);
-}
-
-template <typename BasicJsonType, typename T,
-          enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
-void to_json(BasicJsonType& j, std::valarray<T> arr)
-{
-    external_constructor<value_t::array>::construct(j, std::move(arr));
-}
-
-template <typename BasicJsonType>
-void to_json(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
-{
-    external_constructor<value_t::array>::construct(j, std::move(arr));
-}
-
-template <
-    typename BasicJsonType, typename CompatibleObjectType,
-    enable_if_t<is_compatible_object_type<BasicJsonType, CompatibleObjectType>::value,
-                int> = 0 >
-void to_json(BasicJsonType& j, const CompatibleObjectType& obj)
-{
-    external_constructor<value_t::object>::construct(j, obj);
-}
-
-template <typename BasicJsonType>
-void to_json(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
-{
-    external_constructor<value_t::object>::construct(j, std::move(obj));
-}
-
-template<typename BasicJsonType, typename T, std::size_t N,
-         enable_if_t<not std::is_constructible<
-                         typename BasicJsonType::string_t, T (&)[N]>::value,
-                     int> = 0>
-void to_json(BasicJsonType& j, T (&arr)[N])
-{
-    external_constructor<value_t::array>::construct(j, arr);
-}
-
-template<typename BasicJsonType, typename... Args>
-void to_json(BasicJsonType& j, const std::pair<Args...>& p)
-{
-    j = {p.first, p.second};
-}
-
-template<typename BasicJsonType, typename Tuple, std::size_t... Idx>
-void to_json_tuple_impl(BasicJsonType& j, const Tuple& t, index_sequence<Idx...>)
-{
-    j = {std::get<Idx>(t)...};
-}
-
-template<typename BasicJsonType, typename... Args>
-void to_json(BasicJsonType& j, const std::tuple<Args...>& t)
-{
-    to_json_tuple_impl(j, t, index_sequence_for<Args...> {});
-}
-
-///////////////
-// from_json //
-///////////////
-
 // overloads for basic_json template parameters
 template<typename BasicJsonType, typename ArithmeticType,
          enable_if_t<std::is_arithmetic<ArithmeticType>::value and
-                     not std::is_same<ArithmeticType,
-                                      typename BasicJsonType::boolean_t>::value,
+                     not std::is_same<ArithmeticType, typename BasicJsonType::boolean_t>::value,
                      int> = 0>
 void get_arithmetic_value(const BasicJsonType& j, ArithmeticType& val)
 {
@@ -1127,6 +1111,23 @@ void from_json(const BasicJsonType& j, typename BasicJsonType::string_t& s)
     s = *j.template get_ptr<const typename BasicJsonType::string_t*>();
 }
 
+template <
+    typename BasicJsonType, typename CompatibleStringType,
+    enable_if_t <
+        is_compatible_string_type<BasicJsonType, CompatibleStringType>::value and
+        not std::is_same<typename BasicJsonType::string_t,
+                         CompatibleStringType>::value,
+        int > = 0 >
+void from_json(const BasicJsonType& j, CompatibleStringType& s)
+{
+    if (JSON_UNLIKELY(not j.is_string()))
+    {
+        JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name())));
+    }
+
+    s = *j.template get_ptr<const typename BasicJsonType::string_t*>();
+}
+
 template<typename BasicJsonType>
 void from_json(const BasicJsonType& j, typename BasicJsonType::number_float_t& val)
 {
@@ -1154,16 +1155,6 @@ void from_json(const BasicJsonType& j, EnumType& e)
     e = static_cast<EnumType>(val);
 }
 
-template<typename BasicJsonType>
-void from_json(const BasicJsonType& j, typename BasicJsonType::array_t& arr)
-{
-    if (JSON_UNLIKELY(not j.is_array()))
-    {
-        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name())));
-    }
-    arr = *j.template get_ptr<const typename BasicJsonType::array_t*>();
-}
-
 // forward_list doesn't have an insert method
 template<typename BasicJsonType, typename T, typename Allocator,
          enable_if_t<std::is_convertible<BasicJsonType, T>::value, int> = 0>
@@ -1190,30 +1181,31 @@ void from_json(const BasicJsonType& j, std::valarray<T>& l)
         JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name())));
     }
     l.resize(j.size());
-    for (size_t i = 0; i < j.size(); ++i)
-    {
-        l[i] = j[i];
-    }
+    std::copy(j.m_value.array->begin(), j.m_value.array->end(), std::begin(l));
 }
 
-template<typename BasicJsonType, typename CompatibleArrayType>
-void from_json_array_impl(const BasicJsonType& j, CompatibleArrayType& arr, priority_tag<0> /*unused*/)
+template<typename BasicJsonType>
+void from_json_array_impl(const BasicJsonType& j, typename BasicJsonType::array_t& arr, priority_tag<3> /*unused*/)
 {
-    using std::end;
+    arr = *j.template get_ptr<const typename BasicJsonType::array_t*>();
+}
 
-    std::transform(j.begin(), j.end(),
-                   std::inserter(arr, end(arr)), [](const BasicJsonType & i)
+template <typename BasicJsonType, typename T, std::size_t N>
+auto from_json_array_impl(const BasicJsonType& j, std::array<T, N>& arr,
+                          priority_tag<2> /*unused*/)
+-> decltype(j.template get<T>(), void())
+{
+    for (std::size_t i = 0; i < N; ++i)
     {
-        // get<BasicJsonType>() returns *this, this won't call a from_json
-        // method when value_type is BasicJsonType
-        return i.template get<typename CompatibleArrayType::value_type>();
-    });
+        arr[i] = j.at(i).template get<T>();
+    }
 }
 
 template<typename BasicJsonType, typename CompatibleArrayType>
 auto from_json_array_impl(const BasicJsonType& j, CompatibleArrayType& arr, priority_tag<1> /*unused*/)
 -> decltype(
     arr.reserve(std::declval<typename CompatibleArrayType::size_type>()),
+    j.template get<typename CompatibleArrayType::value_type>(),
     void())
 {
     using std::end;
@@ -1228,27 +1220,42 @@ auto from_json_array_impl(const BasicJsonType& j, CompatibleArrayType& arr, prio
     });
 }
 
-template<typename BasicJsonType, typename T, std::size_t N>
-void from_json_array_impl(const BasicJsonType& j, std::array<T, N>& arr, priority_tag<2> /*unused*/)
+template <typename BasicJsonType, typename CompatibleArrayType>
+void from_json_array_impl(const BasicJsonType& j, CompatibleArrayType& arr,
+                          priority_tag<0> /*unused*/)
 {
-    for (std::size_t i = 0; i < N; ++i)
+    using std::end;
+
+    std::transform(
+        j.begin(), j.end(), std::inserter(arr, end(arr)),
+        [](const BasicJsonType & i)
     {
-        arr[i] = j.at(i).template get<T>();
-    }
+        // get<BasicJsonType>() returns *this, this won't call a from_json
+        // method when value_type is BasicJsonType
+        return i.template get<typename CompatibleArrayType::value_type>();
+    });
 }
 
-template<typename BasicJsonType, typename CompatibleArrayType,
-         enable_if_t<is_compatible_array_type<BasicJsonType, CompatibleArrayType>::value and
-                     std::is_convertible<BasicJsonType, typename CompatibleArrayType::value_type>::value and
-                     not std::is_same<typename BasicJsonType::array_t, CompatibleArrayType>::value, int> = 0>
-void from_json(const BasicJsonType& j, CompatibleArrayType& arr)
+template <typename BasicJsonType, typename CompatibleArrayType,
+          enable_if_t <
+              is_compatible_array_type<BasicJsonType, CompatibleArrayType>::value and
+              not is_compatible_object_type<BasicJsonType, CompatibleArrayType>::value and
+              not is_compatible_string_type<BasicJsonType, CompatibleArrayType>::value and
+              not is_basic_json<CompatibleArrayType>::value,
+              int > = 0 >
+
+auto from_json(const BasicJsonType& j, CompatibleArrayType& arr)
+-> decltype(from_json_array_impl(j, arr, priority_tag<3> {}),
+j.template get<typename CompatibleArrayType::value_type>(),
+void())
 {
     if (JSON_UNLIKELY(not j.is_array()))
     {
-        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name())));
+        JSON_THROW(type_error::create(302, "type must be array, but is " +
+                                      std::string(j.type_name())));
     }
 
-    from_json_array_impl(j, arr, priority_tag<2> {});
+    from_json_array_impl(j, arr, priority_tag<3> {});
 }
 
 template<typename BasicJsonType, typename CompatibleObjectType,
@@ -1331,189 +1338,630 @@ void from_json(const BasicJsonType& j, std::tuple<Args...>& t)
     from_json_tuple_impl(j, t, index_sequence_for<Args...> {});
 }
 
-struct to_json_fn
+template <typename BasicJsonType, typename Key, typename Value, typename Compare, typename Allocator,
+          typename = enable_if_t<not std::is_constructible< // only for keys that cannot be built into a string_t
+                                     typename BasicJsonType::string_t, Key>::value>>
+void from_json(const BasicJsonType& j, std::map<Key, Value, Compare, Allocator>& m) // reads a map stored as an array of [key, value] arrays
 {
-  private:
-    template<typename BasicJsonType, typename T>
-    auto call(BasicJsonType& j, T&& val, priority_tag<1> /*unused*/) const noexcept(noexcept(to_json(j, std::forward<T>(val))))
-    -> decltype(to_json(j, std::forward<T>(val)), void())
+    if (JSON_UNLIKELY(not j.is_array())) // the outer value must be an array
     {
-        return to_json(j, std::forward<T>(val));
+        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name())));
     }
+    for (const auto& p : j) // p is one element of the outer array
+    {
+        if (JSON_UNLIKELY(not p.is_array())) // every element must itself be an array
+        {
+            JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name())));
+        }
+        m.emplace(p.at(0).template get<Key>(), p.at(1).template get<Value>()); // m is not cleared first; element 0 is the key, element 1 the value
+    }
+}
 
-    template<typename BasicJsonType, typename T>
-    void call(BasicJsonType& /*unused*/, T&& /*unused*/, priority_tag<0> /*unused*/) const noexcept
+template <typename BasicJsonType, typename Key, typename Value, typename Hash, typename KeyEqual, typename Allocator,
+          typename = enable_if_t<not std::is_constructible< // only for keys that cannot be built into a string_t
+                                     typename BasicJsonType::string_t, Key>::value>>
+void from_json(const BasicJsonType& j, std::unordered_map<Key, Value, Hash, KeyEqual, Allocator>& m) // reads a map stored as an array of [key, value] arrays
+{
+    if (JSON_UNLIKELY(not j.is_array())) // the outer value must be an array
     {
-        static_assert(sizeof(BasicJsonType) == 0,
-                      "could not find to_json() method in T's namespace");
+        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name())));
     }
-
-  public:
-    template<typename BasicJsonType, typename T>
-    void operator()(BasicJsonType& j, T&& val) const
-    noexcept(noexcept(std::declval<to_json_fn>().call(j, std::forward<T>(val), priority_tag<1> {})))
+    for (const auto& p : j) // p is one element of the outer array
     {
-        return call(j, std::forward<T>(val), priority_tag<1> {});
+        if (JSON_UNLIKELY(not p.is_array())) // every element must itself be an array
+        {
+            JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name())));
+        }
+        m.emplace(p.at(0).template get<Key>(), p.at(1).template get<Value>()); // m is not cleared first; element 0 is the key, element 1 the value
     }
-};
+}
 
 struct from_json_fn
 {
-  private:
     template<typename BasicJsonType, typename T>
-    auto call(const BasicJsonType& j, T& val, priority_tag<1> /*unused*/) const
+    auto operator()(const BasicJsonType& j, T& val) const // forwards to from_json(j, val) through an unqualified call
     noexcept(noexcept(from_json(j, val)))
     -> decltype(from_json(j, val), void())
     {
         return from_json(j, val);
     }
+}; // the only member is the SFINAE-constrained call operator above
+}
 
-    template<typename BasicJsonType, typename T>
-    void call(const BasicJsonType& /*unused*/, T& /*unused*/, priority_tag<0> /*unused*/) const noexcept
+/// namespace to hold default `from_json` function
+/// to see why this is required:
+/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html
+namespace
+{
+constexpr const auto& from_json = detail::static_const<detail::from_json_fn>::value;
+}
+}
+
+// #include <nlohmann/detail/conversions/to_json.hpp>
+
+
+#include <ciso646> // or, and, not
+#include <iterator> // begin, end
+#include <tuple> // tuple, get
+#include <type_traits> // is_same, is_constructible, is_floating_point, is_enum, underlying_type
+#include <utility> // move, forward, declval, pair
+#include <valarray> // valarray
+#include <vector> // vector
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+// #include <nlohmann/detail/iterators/iteration_proxy.hpp>
+
+
+#include <cstddef> // size_t
+#include <string> // string, to_string
+#include <iterator> // input_iterator_tag
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+/// proxy class for the items() function; its begin()/end() return iterators that expose key() and value()
+template<typename IteratorType> class iteration_proxy
+{
+  private:
+    /// helper class for iteration
+    class iteration_proxy_internal
     {
-        static_assert(sizeof(BasicJsonType) == 0,"could not find from_json() method in T's namespace");
-    }
+      public:
+        using difference_type = std::ptrdiff_t;
+        using value_type = iteration_proxy_internal;
+        using pointer = iteration_proxy_internal*;
+        using reference = iteration_proxy_internal&;
+        using iterator_category = std::input_iterator_tag;
+
+      private:
+        /// the iterator
+        IteratorType anchor;
+        /// an index for arrays (used to create key names)
+        std::size_t array_index = 0;
+        /// last stringified array index
+        mutable std::size_t array_index_last = 0;
+        /// a string representation of the array index
+        mutable std::string array_index_str = "0";
+        /// an empty string (to return a reference for primitive values)
+        const std::string empty_str = "";
+
+      public:
+        explicit iteration_proxy_internal(IteratorType it) noexcept : anchor(it) {}
+
+        iteration_proxy_internal(const iteration_proxy_internal&) = default;
+        iteration_proxy_internal& operator=(const iteration_proxy_internal&) = default; // NOTE(review): empty_str is const, so this is presumably implicitly deleted - confirm
+
+        /// dereference operator (needed for range-based for)
+        iteration_proxy_internal& operator*()
+        {
+            return *this; // the loop variable is this object itself, so key()/value() are called on it
+        }
+
+        /// increment operator (needed for range-based for)
+        iteration_proxy_internal& operator++()
+        {
+            ++anchor;
+            ++array_index; // advanced together with anchor; key() reads it only for arrays
+
+            return *this;
+        }
+
+        /// equality operator (needed for InputIterator)
+        bool operator==(const iteration_proxy_internal& o) const noexcept
+        {
+            return anchor == o.anchor; // only the wrapped iterators are compared
+        }
+
+        /// inequality operator (needed for range-based for)
+        bool operator!=(const iteration_proxy_internal& o) const noexcept
+        {
+            return anchor != o.anchor; // only the wrapped iterators are compared
+        }
+
+        /// return key of the iterator
+        const std::string& key() const
+        {
+            assert(anchor.m_object != nullptr);
+
+            switch (anchor.m_object->type())
+            {
+                // use integer array index as key
+                case value_t::array:
+                {
+                    if (array_index != array_index_last) // re-stringify only when the index changed since the last call
+                    {
+                        array_index_str = std::to_string(array_index);
+                        array_index_last = array_index;
+                    }
+                    return array_index_str; // reference to the cached member string
+                }
+
+                // use key from the object
+                case value_t::object:
+                    return anchor.key();
+
+                // use an empty key for all primitive types
+                default:
+                    return empty_str;
+            }
+        }
+
+        /// return value of the iterator
+        typename IteratorType::reference value() const
+        {
+            return anchor.value(); // forwarded to the wrapped iterator
+        }
+    };
+
+    /// the container to iterate
+    typename IteratorType::reference container;
 
   public:
-    template<typename BasicJsonType, typename T>
-    void operator()(const BasicJsonType& j, T& val) const
-    noexcept(noexcept(std::declval<from_json_fn>().call(j, val, priority_tag<1> {})))
+    /// construct iteration proxy from a container
+    explicit iteration_proxy(typename IteratorType::reference cont) noexcept
+        : container(cont) {}
+
+    /// return iterator begin (needed for range-based for)
+    iteration_proxy_internal begin() noexcept
     {
-        return call(j, val, priority_tag<1> {});
+        return iteration_proxy_internal(container.begin()); // wraps the container's begin iterator
+    }
+
+    /// return iterator end (needed for range-based for)
+    iteration_proxy_internal end() noexcept
+    {
+        return iteration_proxy_internal(container.end()); // wraps the container's end iterator
+    }
+};
+}
+}
+
+
+namespace nlohmann
+{
+namespace detail
+{
+//////////////////
+// constructors //
+//////////////////
+
+template<value_t> struct external_constructor;
+
+template<>
+struct external_constructor<value_t::boolean> // turns j into a JSON boolean
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept
+    {
+        j.m_type = value_t::boolean; // set the type tag
+        j.m_value = b; // store the payload
+        j.assert_invariant(); // invariant check on the updated value
     }
 };
 
-// taken from ranges-v3
-template<typename T>
-struct static_const
+template<>
+struct external_constructor<value_t::string> // turns j into a JSON string
 {
-    static constexpr T value{};
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s) // copying overload
+    {
+        j.m_type = value_t::string;
+        j.m_value = s;
+        j.assert_invariant();
+    }
+
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s) // overload for rvalue strings
+    {
+        j.m_type = value_t::string;
+        j.m_value = std::move(s); // s is handed on as an rvalue
+        j.assert_invariant();
+    }
+
+    template<typename BasicJsonType, typename CompatibleStringType,
+             enable_if_t<not std::is_same<CompatibleStringType, typename BasicJsonType::string_t>::value,
+                         int> = 0>
+    static void construct(BasicJsonType& j, const CompatibleStringType& str) // any other string-like type
+    {
+        j.m_type = value_t::string;
+        j.m_value.string = j.template create<typename BasicJsonType::string_t>(str); // create<> is defined elsewhere; presumably builds a new string_t from str
+        j.assert_invariant();
+    }
 };
 
-template<typename T>
-constexpr T static_const<T>::value;
+template<>
+struct external_constructor<value_t::number_float> // turns j into a JSON floating-point number
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept
+    {
+        j.m_type = value_t::number_float; // set the type tag
+        j.m_value = val; // store the payload
+        j.assert_invariant(); // invariant check on the updated value
+    }
+};
+
+template<>
+struct external_constructor<value_t::number_unsigned> // turns j into a JSON unsigned integer
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept
+    {
+        j.m_type = value_t::number_unsigned; // set the type tag
+        j.m_value = val; // store the payload
+        j.assert_invariant(); // invariant check on the updated value
+    }
+};
+
+template<>
+struct external_constructor<value_t::number_integer> // turns j into a JSON signed integer
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept
+    {
+        j.m_type = value_t::number_integer; // set the type tag
+        j.m_value = val; // store the payload
+        j.assert_invariant(); // invariant check on the updated value
+    }
+};
+
+template<>
+struct external_constructor<value_t::array> // turns j into a JSON array
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr) // copying overload
+    {
+        j.m_type = value_t::array;
+        j.m_value = arr;
+        j.assert_invariant();
+    }
+
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr) // overload for rvalue arrays
+    {
+        j.m_type = value_t::array;
+        j.m_value = std::move(arr); // arr is handed on as an rvalue
+        j.assert_invariant();
+    }
+
+    template<typename BasicJsonType, typename CompatibleArrayType,
+             enable_if_t<not std::is_same<CompatibleArrayType, typename BasicJsonType::array_t>::value,
+                         int> = 0>
+    static void construct(BasicJsonType& j, const CompatibleArrayType& arr) // any other container type
+    {
+        using std::begin; // unqualified begin/end below: std versions or ones found by ADL
+        using std::end;
+        j.m_type = value_t::array;
+        j.m_value.array = j.template create<typename BasicJsonType::array_t>(begin(arr), end(arr)); // array_t built from the iterator range
+        j.assert_invariant();
+    }
+
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, const std::vector<bool>& arr) // dedicated overload for std::vector<bool>
+    {
+        j.m_type = value_t::array;
+        j.m_value = value_t::array; // set up m_value for an array; m_value.array is dereferenced right below
+        j.m_value.array->reserve(arr.size());
+        for (const bool x : arr) // each element is copied out as a plain bool
+        {
+            j.m_value.array->push_back(x);
+        }
+        j.assert_invariant();
+    }
+
+    template<typename BasicJsonType, typename T,
+             enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
+    static void construct(BasicJsonType& j, const std::valarray<T>& arr) // dedicated overload for std::valarray
+    {
+        j.m_type = value_t::array;
+        j.m_value = value_t::array; // set up m_value for an array; m_value.array is dereferenced right below
+        j.m_value.array->resize(arr.size()); // size the array first so std::copy can write into it
+        std::copy(std::begin(arr), std::end(arr), j.m_value.array->begin());
+        j.assert_invariant();
+    }
+};
+
+template<>
+struct external_constructor<value_t::object> // turns j into a JSON object
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj) // copying overload
+    {
+        j.m_type = value_t::object;
+        j.m_value = obj;
+        j.assert_invariant();
+    }
+
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj) // overload for rvalue objects
+    {
+        j.m_type = value_t::object;
+        j.m_value = std::move(obj); // obj is handed on as an rvalue
+        j.assert_invariant();
+    }
+
+    template<typename BasicJsonType, typename CompatibleObjectType,
+             enable_if_t<not std::is_same<CompatibleObjectType, typename BasicJsonType::object_t>::value, int> = 0>
+    static void construct(BasicJsonType& j, const CompatibleObjectType& obj) // any other map-like type
+    {
+        using std::begin; // unqualified begin/end below: std versions or ones found by ADL
+        using std::end;
+
+        j.m_type = value_t::object;
+        j.m_value.object = j.template create<typename BasicJsonType::object_t>(begin(obj), end(obj)); // object_t built from the iterator range
+        j.assert_invariant();
+    }
+};
+
+/////////////
+// to_json //
+/////////////
+
+template<typename BasicJsonType, typename T,
+         enable_if_t<std::is_same<T, typename BasicJsonType::boolean_t>::value, int> = 0> // exact boolean_t only
+void to_json(BasicJsonType& j, T b) noexcept
+{
+    external_constructor<value_t::boolean>::construct(j, b); // j becomes a JSON boolean
+}
+
+template<typename BasicJsonType, typename CompatibleString,
+         enable_if_t<std::is_constructible<typename BasicJsonType::string_t, CompatibleString>::value, int> = 0> // anything a string_t can be constructed from
+void to_json(BasicJsonType& j, const CompatibleString& s)
+{
+    external_constructor<value_t::string>::construct(j, s); // j becomes a JSON string
+}
+
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, typename BasicJsonType::string_t&& s) // rvalue string_t overload
+{
+    external_constructor<value_t::string>::construct(j, std::move(s)); // forwards s as an rvalue
+}
+
+template<typename BasicJsonType, typename FloatType,
+         enable_if_t<std::is_floating_point<FloatType>::value, int> = 0> // any floating-point type
+void to_json(BasicJsonType& j, FloatType val) noexcept
+{
+    external_constructor<value_t::number_float>::construct(j, static_cast<typename BasicJsonType::number_float_t>(val)); // val is cast to number_float_t before storing
+}
+
+template<typename BasicJsonType, typename CompatibleNumberUnsignedType,
+         enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_unsigned_t, CompatibleNumberUnsignedType>::value, int> = 0>
+void to_json(BasicJsonType& j, CompatibleNumberUnsignedType val) noexcept
+{
+    external_constructor<value_t::number_unsigned>::construct(j, static_cast<typename BasicJsonType::number_unsigned_t>(val)); // val is cast to number_unsigned_t before storing
+}
+
+template<typename BasicJsonType, typename CompatibleNumberIntegerType,
+         enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_integer_t, CompatibleNumberIntegerType>::value, int> = 0>
+void to_json(BasicJsonType& j, CompatibleNumberIntegerType val) noexcept
+{
+    external_constructor<value_t::number_integer>::construct(j, static_cast<typename BasicJsonType::number_integer_t>(val)); // val is cast to number_integer_t before storing
+}
+
+template<typename BasicJsonType, typename EnumType,
+         enable_if_t<std::is_enum<EnumType>::value, int> = 0> // any enumeration type
+void to_json(BasicJsonType& j, EnumType e) noexcept
+{
+    using underlying_type = typename std::underlying_type<EnumType>::type;
+    external_constructor<value_t::number_integer>::construct(j, static_cast<underlying_type>(e)); // enums are stored as number_integer, by numeric value
+}
+
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, const std::vector<bool>& e) // std::vector<bool> has its own overload
+{
+    external_constructor<value_t::array>::construct(j, e); // j becomes a JSON array
+}
+
+template <typename BasicJsonType, typename CompatibleArrayType,
+          enable_if_t<is_compatible_array_type<BasicJsonType,
+                      CompatibleArrayType>::value and
+                      not is_compatible_object_type< // types that also qualify as objects are excluded here
+                          BasicJsonType, CompatibleArrayType>::value and
+                      not is_compatible_string_type<BasicJsonType, CompatibleArrayType>::value and // ... as are string-like types
+                      not is_basic_json<CompatibleArrayType>::value, // ... and basic_json itself
+                      int> = 0>
+void to_json(BasicJsonType& j, const CompatibleArrayType& arr)
+{
+    external_constructor<value_t::array>::construct(j, arr); // j becomes a JSON array
+}
+
+template<typename BasicJsonType, typename T,
+         enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0> // element type must be convertible to JSON
+void to_json(BasicJsonType& j, const std::valarray<T>& arr)
+{
+    external_constructor<value_t::array>::construct(j, arr); // arr is const and gets copied; std::move on it would not move anything
+}
+
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, typename BasicJsonType::array_t&& arr) // rvalue array_t overload
+{
+    external_constructor<value_t::array>::construct(j, std::move(arr)); // forwards arr as an rvalue
+}
+
+template<typename BasicJsonType, typename CompatibleObjectType,
+         enable_if_t<is_compatible_object_type<BasicJsonType, CompatibleObjectType>::value and not is_basic_json<CompatibleObjectType>::value, int> = 0>
+void to_json(BasicJsonType& j, const CompatibleObjectType& obj) // object-compatible types other than basic_json
+{
+    external_constructor<value_t::object>::construct(j, obj); // j becomes a JSON object
+}
+
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, typename BasicJsonType::object_t&& obj) // rvalue object_t overload
+{
+    external_constructor<value_t::object>::construct(j, std::move(obj)); // forwards obj as an rvalue
+}
+
+template <
+    typename BasicJsonType, typename T, std::size_t N,
+    enable_if_t<not std::is_constructible<typename BasicJsonType::string_t, // excludes C arrays a string_t can be built from
+                const T (&)[N]>::value,
+                int> = 0 >
+void to_json(BasicJsonType& j, const T (&arr)[N]) // built-in array overload
+{
+    external_constructor<value_t::array>::construct(j, arr); // j becomes a JSON array
+}
+
+template<typename BasicJsonType, typename... Args>
+void to_json(BasicJsonType& j, const std::pair<Args...>& p) // std::pair overload
+{
+    j = {p.first, p.second}; // assigns a two-element initializer list: first, then second
+}
+
+// for https://github.com/nlohmann/json/pull/1134
+template<typename BasicJsonType, typename T,
+         enable_if_t<std::is_same<T, typename iteration_proxy<typename BasicJsonType::iterator>::iteration_proxy_internal>::value, int> = 0>
+void to_json(BasicJsonType& j, T b) // not noexcept: building the key string and the JSON value can allocate and throw
+{
+    j = {{b.key(), b.value()}}; // nested initializer list holding one (key, value) pair
+}
+
+template<typename BasicJsonType, typename Tuple, std::size_t... Idx>
+void to_json_tuple_impl(BasicJsonType& j, const Tuple& t, index_sequence<Idx...>) // Idx... enumerates the tuple positions
+{
+    j = {std::get<Idx>(t)...}; // one initializer-list element per tuple member, in index order
+}
+
+template<typename BasicJsonType, typename... Args>
+void to_json(BasicJsonType& j, const std::tuple<Args...>& t) // std::tuple overload
+{
+    to_json_tuple_impl(j, t, index_sequence_for<Args...> {}); // supplies one index per tuple element
+}
+
+struct to_json_fn // function object that forwards to a matching to_json overload
+{
+    template<typename BasicJsonType, typename T>
+    auto operator()(BasicJsonType& j, T&& val) const noexcept(noexcept(to_json(j, std::forward<T>(val)))) // noexcept exactly when the chosen to_json is
+    -> decltype(to_json(j, std::forward<T>(val)), void()) // removed from overload resolution when no to_json(j, val) is callable
+    {
+        return to_json(j, std::forward<T>(val));
+    }
+};
+}
+
+/// namespace to hold default `to_json` function
+namespace
+{
+constexpr const auto& to_json = detail::static_const<detail::to_json_fn>::value;
+}
+}
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+
+#include <cassert> // assert
+#include <cstddef> // size_t
+#include <cstring> // strlen
+#include <istream> // istream
+#include <iterator> // begin, end, iterator_traits, random_access_iterator_tag, distance, next
+#include <memory> // shared_ptr, make_shared, addressof
+#include <numeric> // accumulate
+#include <string> // string, char_traits
+#include <type_traits> // enable_if, is_base_of, is_pointer, is_integral, remove_pointer
+#include <utility> // pair, declval
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+/// the supported input formats (presumably JSON text plus the CBOR, MessagePack and UBJSON binary encodings)
+enum class input_format_t { json, cbor, msgpack, ubjson };
 
 ////////////////////
 // input adapters //
 ////////////////////
 
-/// abstract input adapter interface
+/*!
+@brief abstract input adapter interface
+
+Produces a stream of std::char_traits<char>::int_type characters from a
+std::istream, a buffer, or some other input type. Accepts the return of
+exactly one non-EOF character for future input. The int_type characters
+returned consist of all valid char values as positive values (typically
+unsigned char), plus an EOF value outside that range, specified by the value
+of the function std::char_traits<char>::eof(). This value is typically -1, but
+could be any arbitrary value which is not a valid char value.
+*/
 struct input_adapter_protocol
 {
-    virtual int get_character() = 0;
-    virtual std::string read(std::size_t offset, std::size_t length) = 0;
+    /// get a character [0,255] or std::char_traits<char>::eof().
+    virtual std::char_traits<char>::int_type get_character() = 0; // pure virtual: each adapter provides its own source
     virtual ~input_adapter_protocol() = default;
 };
 
 /// a type to simplify interfaces
 using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
 
-/// input adapter for cached stream input
-template<std::size_t BufferSize>
-class cached_input_stream_adapter : public input_adapter_protocol
+/*!
+Input adapter for a (caching) istream. Ignores a UTF Byte Order Mark at
+beginning of input. Does not support changing the underlying std::streambuf
+in mid-input. Maintains underlying std::istream and std::streambuf to support
+subsequent use of standard std::istream operations to process any input
+characters following those used in parsing the JSON input.  Clears the
+std::istream flags; any input errors (e.g., EOF) will be detected by the first
+subsequent call for input from the std::istream.
+*/
+class input_stream_adapter : public input_adapter_protocol
 {
   public:
-    explicit cached_input_stream_adapter(std::istream& i)
-        : is(i), start_position(is.tellg())
+    ~input_stream_adapter() override
     {
-        fill_buffer();
-
-        // skip byte order mark
-        if (fill_size >= 3 and buffer[0] == '\xEF' and buffer[1] == '\xBB' and buffer[2] == '\xBF')
-        {
-            buffer_pos += 3;
-            processed_chars += 3;
-        }
-    }
-
-    ~cached_input_stream_adapter() override
-    {
-        // clear stream flags
-        is.clear();
-        // We initially read a lot of characters into the buffer, and we may
-        // not have processed all of them. Therefore, we need to "rewind" the
-        // stream after the last processed char.
-        is.seekg(start_position);
-        is.ignore(static_cast<std::streamsize>(processed_chars));
-        // clear stream flags
+        // clear the stream's state flags on destruction; reading goes through
+        // the streambuf (sb) directly, so this class never sets those flags
         is.clear();
     }
 
-    int get_character() override
+    explicit input_stream_adapter(std::istream& i)
+        : is(i), sb(*i.rdbuf()) // keep the stream and the buffer it currently uses
+    {}
+
+    // not copyable: the object only holds references to the stream and its buffer
+    input_stream_adapter(const input_stream_adapter&) = delete;
+    input_stream_adapter& operator=(input_stream_adapter&) = delete;
+
+    // std::istream/std::streambuf use std::char_traits<char>::to_int_type, to
+    // ensure that std::char_traits<char>::eof() and the character 0xFF do not
+    // end up as the same value, eg. 0xFFFFFFFF.
+    std::char_traits<char>::int_type get_character() override
     {
-        // check if refilling is necessary and possible
-        if (buffer_pos == fill_size and not eof)
-        {
-            fill_buffer();
-
-            // check and remember that filling did not yield new input
-            if (fill_size == 0)
-            {
-                eof = true;
-                return std::char_traits<char>::eof();
-            }
-
-            // the buffer is ready
-            buffer_pos = 0;
-        }
-
-        ++processed_chars;
-        assert(buffer_pos < buffer.size());
-        return buffer[buffer_pos++] & 0xFF;
-    }
-
-    std::string read(std::size_t offset, std::size_t length) override
-    {
-        // create buffer
-        std::string result(length, '\0');
-
-        // save stream position
-        const auto current_pos = is.tellg();
-        // save stream flags
-        const auto flags = is.rdstate();
-
-        // clear stream flags
-        is.clear();
-        // set stream position
-        is.seekg(static_cast<std::streamoff>(offset));
-        // read bytes
-        is.read(&result[0], static_cast<std::streamsize>(length));
-
-        // reset stream position
-        is.seekg(current_pos);
-        // reset stream flags
-        is.setstate(flags);
-
-        return result;
+        return sb.sbumpc(); // read one character from the buffer and advance past it
     }
 
   private:
-    void fill_buffer()
-    {
-        // fill
-        is.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
-        // store number of bytes in the buffer
-        fill_size = static_cast<size_t>(is.gcount());
-    }
-
     /// the associated input stream
     std::istream& is;
-
-    /// chars returned via get_character()
-    std::size_t processed_chars = 0;
-    /// chars processed in the current buffer
-    std::size_t buffer_pos = 0;
-
-    /// whether stream reached eof
-    bool eof = false;
-    /// how many chars have been copied to the buffer by last (re)fill
-    std::size_t fill_size = 0;
-
-    /// position of the stream when we started
-    const std::streampos start_position;
-
-    /// internal buffer
-    std::array<char, BufferSize> buffer{{}};
+    std::streambuf& sb; // stream buffer obtained from is.rdbuf() in the constructor; all reads go through it
 };
 
 /// input adapter for buffer input
@@ -1521,43 +1969,181 @@ class input_buffer_adapter : public input_adapter_protocol
 {
   public:
     input_buffer_adapter(const char* b, const std::size_t l)
-        : cursor(b), limit(b + l), start(b)
-    {
-        // skip byte order mark
-        if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
-        {
-            cursor += 3;
-        }
-    }
+        : cursor(b), limit(b + l)
+    {}
 
     // delete because of pointer members
     input_buffer_adapter(const input_buffer_adapter&) = delete;
     input_buffer_adapter& operator=(input_buffer_adapter&) = delete;
 
-    int get_character() noexcept override
+    std::char_traits<char>::int_type get_character() noexcept override
     {
         if (JSON_LIKELY(cursor < limit))
         {
-            return *(cursor++) & 0xFF;
+            return std::char_traits<char>::to_int_type(*(cursor++));
         }
 
         return std::char_traits<char>::eof();
     }
 
-    std::string read(std::size_t offset, std::size_t length) override
-    {
-        // avoid reading too many characters
-        const auto max_length = static_cast<size_t>(limit - start);
-        return std::string(start + offset, (std::min)(length, max_length - offset));
-    }
-
   private:
     /// pointer to the current character
     const char* cursor;
     /// pointer past the last character
-    const char* limit;
-    /// pointer to the first character
-    const char* start;
+    const char* const limit;
+};
+
+template<typename WideStringType>
+class wide_string_input_adapter : public input_adapter_protocol // serves a UTF-16/UTF-32 string as UTF-8 bytes
+{
+  public:
+    explicit wide_string_input_adapter(const WideStringType& w) : str(w) {} // keeps a reference: w must outlive the adapter
+
+    std::char_traits<char>::int_type get_character() noexcept override
+    {
+        // check if buffer needs to be filled
+        if (utf8_bytes_index == utf8_bytes_filled)
+        {
+            if (sizeof(typename WideStringType::value_type) == 2) // 2-byte code units are decoded as UTF-16, all others as UTF-32
+            {
+                fill_buffer_utf16();
+            }
+            else
+            {
+                fill_buffer_utf32();
+            }
+
+            assert(utf8_bytes_filled > 0);
+            assert(utf8_bytes_index == 0);
+        }
+
+        // use buffer
+        assert(utf8_bytes_filled > 0);
+        assert(utf8_bytes_index < utf8_bytes_filled);
+        return utf8_bytes[utf8_bytes_index++];
+    }
+
+  private:
+    void fill_buffer_utf16() // refills utf8_bytes from the next UTF-16 code point, or with eof() at the end
+    {
+        utf8_bytes_index = 0;
+
+        if (current_wchar == str.size()) // input exhausted: hand out eof()
+        {
+            utf8_bytes[0] = std::char_traits<char>::eof();
+            utf8_bytes_filled = 1;
+        }
+        else
+        {
+            // get the current character
+            const int wc = static_cast<int>(str[current_wchar++]);
+
+            // UTF-16 to UTF-8 encoding
+            if (wc < 0x80)
+            {
+                utf8_bytes[0] = wc;
+                utf8_bytes_filled = 1;
+            }
+            else if (wc <= 0x7FF)
+            {
+                utf8_bytes[0] = 0xC0 | ((wc >> 6));
+                utf8_bytes[1] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 2;
+            }
+            else if (0xD800 > wc or wc >= 0xE000) // outside the surrogate range: three-byte sequence
+            {
+                utf8_bytes[0] = 0xE0 | ((wc >> 12));
+                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
+                utf8_bytes[2] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 3;
+            }
+            else
+            {
+                if (current_wchar < str.size()) // surrogate: combine with the following code unit
+                {
+                    const int wc2 = static_cast<int>(str[current_wchar++]);
+                    const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
+                    utf8_bytes[0] = 0xf0 | (charcode >> 18);
+                    utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
+                    utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
+                    utf8_bytes[3] = 0x80 | (charcode & 0x3F);
+                    utf8_bytes_filled = 4;
+                }
+                else
+                {
+                    // unpaired surrogate at the end of the input: current_wchar already
+                    // equals str.size(); advancing it again would skip the end check above
+                    utf8_bytes[0] = wc;
+                    utf8_bytes_filled = 1;
+                }
+            }
+        }
+    }
+
+    void fill_buffer_utf32() // refills utf8_bytes from the next UTF-32 code point, or with eof() at the end
+    {
+        utf8_bytes_index = 0;
+
+        if (current_wchar == str.size()) // input exhausted: hand out eof()
+        {
+            utf8_bytes[0] = std::char_traits<char>::eof();
+            utf8_bytes_filled = 1;
+        }
+        else
+        {
+            // get the current character
+            const int wc = static_cast<int>(str[current_wchar++]);
+
+            // UTF-32 to UTF-8 encoding
+            if (wc < 0x80)
+            {
+                utf8_bytes[0] = wc;
+                utf8_bytes_filled = 1;
+            }
+            else if (wc <= 0x7FF)
+            {
+                utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
+                utf8_bytes[1] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 2;
+            }
+            else if (wc <= 0xFFFF)
+            {
+                utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
+                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
+                utf8_bytes[2] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 3;
+            }
+            else if (wc <= 0x10FFFF)
+            {
+                utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
+                utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
+                utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
+                utf8_bytes[3] = 0x80 | (wc & 0x3F);
+                utf8_bytes_filled = 4;
+            }
+            else
+            {
+                // unknown character
+                utf8_bytes[0] = wc;
+                utf8_bytes_filled = 1;
+            }
+        }
+    }
+
+  private:
+    /// the wstring to process
+    const WideStringType& str;
+
+    /// index of the current wchar in str
+    std::size_t current_wchar = 0;
+
+    /// a buffer for UTF-8 bytes
+    std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
+
+    /// index to the utf8_bytes array for the next valid byte
+    std::size_t utf8_bytes_index = 0;
+    /// number of valid bytes in the utf8_bytes array
+    std::size_t utf8_bytes_filled = 0;
 };
 
 class input_adapter
@@ -1567,18 +2153,26 @@ class input_adapter
 
     /// input adapter for input stream
     input_adapter(std::istream& i)
-        : ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {}
+        : ia(std::make_shared<input_stream_adapter>(i)) {}
 
     /// input adapter for input stream
     input_adapter(std::istream&& i)
-        : ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {}
+        : ia(std::make_shared<input_stream_adapter>(i)) {}
+
+    input_adapter(const std::wstring& ws)
+        : ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
+
+    input_adapter(const std::u16string& ws)
+        : ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
+
+    input_adapter(const std::u32string& ws)
+        : ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
 
     /// input adapter for buffer
     template<typename CharT,
              typename std::enable_if<
                  std::is_pointer<CharT>::value and
-                 std::is_integral<
-                     typename std::remove_pointer<CharT>::type>::value and
+                 std::is_integral<typename std::remove_pointer<CharT>::type>::value and
                  sizeof(typename std::remove_pointer<CharT>::type) == 1,
                  int>::type = 0>
     input_adapter(CharT b, std::size_t l)
@@ -1590,8 +2184,7 @@ class input_adapter
     template<typename CharT,
              typename std::enable_if<
                  std::is_pointer<CharT>::value and
-                 std::is_integral<
-                     typename std::remove_pointer<CharT>::type>::value and
+                 std::is_integral<typename std::remove_pointer<CharT>::type>::value and
                  sizeof(typename std::remove_pointer<CharT>::type) == 1,
                  int>::type = 0>
     input_adapter(CharT b)
@@ -1601,20 +2194,22 @@ class input_adapter
     /// input adapter for iterator range with contiguous storage
     template<class IteratorType,
              typename std::enable_if<
-                 std::is_same<typename std::iterator_traits<IteratorType>::iterator_category,
-                              std::random_access_iterator_tag>::value,
+                 std::is_same<typename std::iterator_traits<IteratorType>::iterator_category, std::random_access_iterator_tag>::value,
                  int>::type = 0>
     input_adapter(IteratorType first, IteratorType last)
     {
+#ifndef NDEBUG
         // assertion to check that the iterator range is indeed contiguous,
         // see http://stackoverflow.com/a/35008842/266378 for more discussion
-        assert(std::accumulate(
-                   first, last, std::pair<bool, int>(true, 0),
-                   [&first](std::pair<bool, int> res, decltype(*first) val)
+        const auto is_contiguous = std::accumulate(
+                                       first, last, std::pair<bool, int>(true, 0),
+                                       [&first](std::pair<bool, int> res, decltype(*first) val)
         {
             res.first &= (val == *(std::next(std::addressof(*first), res.second++)));
             return res;
-        }).first);
+        }).first;
+        assert(is_contiguous);
+#endif
 
         // assertion to check that each element is 1 byte long
         static_assert(
@@ -1640,13 +2235,10 @@ class input_adapter
         : input_adapter(std::begin(array), std::end(array)) {}
 
     /// input adapter for contiguous container
-    template <
-        class ContiguousContainer,
-        typename std::enable_if <
-            not std::is_pointer<ContiguousContainer>::value and
-            std::is_base_of<std::random_access_iterator_tag,
-                            typename std::iterator_traits<decltype(std::begin(std::declval<ContiguousContainer const>()))>::iterator_category>::value,
-            int >::type = 0 >
+    template<class ContiguousContainer, typename
+             std::enable_if<not std::is_pointer<ContiguousContainer>::value and
+                            std::is_base_of<std::random_access_iterator_tag, typename std::iterator_traits<decltype(std::begin(std::declval<ContiguousContainer const>()))>::iterator_category>::value,
+                            int>::type = 0>
     input_adapter(const ContiguousContainer& c)
         : input_adapter(std::begin(c), std::end(c)) {}
 
@@ -1659,10 +2251,32 @@ class input_adapter
     /// the actual adapter
     input_adapter_t ia = nullptr;
 };
+}
+}
 
-//////////////////////
-// lexer and parser //
-//////////////////////
+// #include <nlohmann/detail/input/lexer.hpp>
+
+
+#include <clocale> // localeconv
+#include <cstddef> // size_t
+#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
+#include <cstdio> // snprintf
+#include <initializer_list> // initializer_list
+#include <string> // char_traits, string
+#include <vector> // vector
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////
+// lexer //
+///////////
 
 /*!
 @brief lexical analysis
@@ -1675,6 +2289,7 @@ class lexer
     using number_integer_t = typename BasicJsonType::number_integer_t;
     using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
     using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
 
   public:
     /// token types for the parser
@@ -1736,12 +2351,14 @@ class lexer
                 return "end of input";
             case token_type::literal_or_value:
                 return "'[', '{', or a literal";
+            // LCOV_EXCL_START
             default: // catch non-enum values
-                return "unknown token"; // LCOV_EXCL_LINE
+                return "unknown token";
+                // LCOV_EXCL_STOP
         }
     }
 
-    explicit lexer(detail::input_adapter_t adapter)
+    explicit lexer(detail::input_adapter_t&& adapter)
         : ia(std::move(adapter)), decimal_point_char(get_decimal_point()) {}
 
     // delete because of pointer members
@@ -1758,7 +2375,7 @@ class lexer
     {
         const auto loc = localeconv();
         assert(loc != nullptr);
-        return (loc->decimal_point == nullptr) ? '.' : loc->decimal_point[0];
+        return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
     }
 
     /////////////////////
@@ -1820,6 +2437,12 @@ class lexer
     checks if it is inside the range. If a violation was detected, set up an
     error message and return false. Otherwise, return true.
 
+    @param[in] ranges  list of integers; interpreted as list of pairs of
+                       inclusive lower and upper bound, respectively
+
+    @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
+         1, 2, or 3 pairs. This precondition is enforced by an assertion.
+
     @return true if and only if no range violation was detected
     */
     bool next_byte_in_range(std::initializer_list<int> ranges)
@@ -1848,9 +2471,10 @@ class lexer
     @brief scan a string literal
 
     This function scans a string according to Sect. 7 of RFC 7159. While
-    scanning, bytes are escaped and copied into buffer yytext. Then the
-    function returns successfully, yytext is null-terminated and yylen
-    contains the number of bytes in the string.
+    scanning, bytes are escaped and copied into buffer token_buffer. When the
+    function returns successfully, token_buffer is *not* null-terminated (as it
+    may contain \0 bytes), and token_buffer.size() is the number of bytes in the
+    string.
 
     @return token_type::value_string if string could be successfully scanned,
             token_type::parse_error otherwise
@@ -1860,7 +2484,7 @@ class lexer
     */
     token_type scan_string()
     {
-        // reset yytext (ignore opening quote)
+        // reset token_buffer (ignore opening quote)
         reset();
 
         // we entered the function by reading an open quote
@@ -1881,9 +2505,6 @@ class lexer
                 // closing quote
                 case '\"':
                 {
-                    // terminate yytext
-                    add('\0');
-                    --yylen;
                     return token_type::value_string;
                 }
 
@@ -1928,8 +2549,8 @@ class lexer
                         // unicode escapes
                         case 'u':
                         {
-                            int codepoint;
                             const int codepoint1 = get_codepoint();
+                            int codepoint = codepoint1; // start with codepoint1
 
                             if (JSON_UNLIKELY(codepoint1 == -1))
                             {
@@ -1954,6 +2575,7 @@ class lexer
                                     // check if codepoint2 is a low surrogate
                                     if (JSON_LIKELY(0xDC00 <= codepoint2 and codepoint2 <= 0xDFFF))
                                     {
+                                        // overwrite codepoint
                                         codepoint =
                                             // high surrogate occupies the most significant 22 bits
                                             (codepoint1 << 10)
@@ -1983,27 +2605,24 @@ class lexer
                                     error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
                                     return token_type::parse_error;
                                 }
-
-                                // only work with first code point
-                                codepoint = codepoint1;
                             }
 
                             // result of the above calculation yields a proper codepoint
                             assert(0x00 <= codepoint and codepoint <= 0x10FFFF);
 
-                            // translate code point to bytes
+                            // translate codepoint into bytes
                             if (codepoint < 0x80)
                             {
                                 // 1-byte characters: 0xxxxxxx (ASCII)
                                 add(codepoint);
                             }
-                            else if (codepoint <= 0x7ff)
+                            else if (codepoint <= 0x7FF)
                             {
                                 // 2-byte characters: 110xxxxx 10xxxxxx
                                 add(0xC0 | (codepoint >> 6));
                                 add(0x80 | (codepoint & 0x3F));
                             }
-                            else if (codepoint <= 0xffff)
+                            else if (codepoint <= 0xFFFF)
                             {
                                 // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
                                 add(0xE0 | (codepoint >> 12));
@@ -2042,12 +2661,12 @@ class lexer
                 case 0x07:
                 case 0x08:
                 case 0x09:
-                case 0x0a:
-                case 0x0b:
-                case 0x0c:
-                case 0x0d:
-                case 0x0e:
-                case 0x0f:
+                case 0x0A:
+                case 0x0B:
+                case 0x0C:
+                case 0x0D:
+                case 0x0E:
+                case 0x0F:
                 case 0x10:
                 case 0x11:
                 case 0x12:
@@ -2058,12 +2677,12 @@ class lexer
                 case 0x17:
                 case 0x18:
                 case 0x19:
-                case 0x1a:
-                case 0x1b:
-                case 0x1c:
-                case 0x1d:
-                case 0x1e:
-                case 0x1f:
+                case 0x1A:
+                case 0x1B:
+                case 0x1C:
+                case 0x1D:
+                case 0x1E:
+                case 0x1F:
                 {
                     error_message = "invalid string: control character must be escaped";
                     return token_type::parse_error;
@@ -2079,12 +2698,12 @@ class lexer
                 case 0x27:
                 case 0x28:
                 case 0x29:
-                case 0x2a:
-                case 0x2b:
-                case 0x2c:
-                case 0x2d:
-                case 0x2e:
-                case 0x2f:
+                case 0x2A:
+                case 0x2B:
+                case 0x2C:
+                case 0x2D:
+                case 0x2E:
+                case 0x2F:
                 case 0x30:
                 case 0x31:
                 case 0x32:
@@ -2095,12 +2714,12 @@ class lexer
                 case 0x37:
                 case 0x38:
                 case 0x39:
-                case 0x3a:
-                case 0x3b:
-                case 0x3c:
-                case 0x3d:
-                case 0x3e:
-                case 0x3f:
+                case 0x3A:
+                case 0x3B:
+                case 0x3C:
+                case 0x3D:
+                case 0x3E:
+                case 0x3F:
                 case 0x40:
                 case 0x41:
                 case 0x42:
@@ -2111,12 +2730,12 @@ class lexer
                 case 0x47:
                 case 0x48:
                 case 0x49:
-                case 0x4a:
-                case 0x4b:
-                case 0x4c:
-                case 0x4d:
-                case 0x4e:
-                case 0x4f:
+                case 0x4A:
+                case 0x4B:
+                case 0x4C:
+                case 0x4D:
+                case 0x4E:
+                case 0x4F:
                 case 0x50:
                 case 0x51:
                 case 0x52:
@@ -2127,11 +2746,11 @@ class lexer
                 case 0x57:
                 case 0x58:
                 case 0x59:
-                case 0x5a:
-                case 0x5b:
-                case 0x5d:
-                case 0x5e:
-                case 0x5f:
+                case 0x5A:
+                case 0x5B:
+                case 0x5D:
+                case 0x5E:
+                case 0x5F:
                 case 0x60:
                 case 0x61:
                 case 0x62:
@@ -2142,12 +2761,12 @@ class lexer
                 case 0x67:
                 case 0x68:
                 case 0x69:
-                case 0x6a:
-                case 0x6b:
-                case 0x6c:
-                case 0x6d:
-                case 0x6e:
-                case 0x6f:
+                case 0x6A:
+                case 0x6B:
+                case 0x6C:
+                case 0x6D:
+                case 0x6E:
+                case 0x6F:
                 case 0x70:
                 case 0x71:
                 case 0x72:
@@ -2158,48 +2777,48 @@ class lexer
                 case 0x77:
                 case 0x78:
                 case 0x79:
-                case 0x7a:
-                case 0x7b:
-                case 0x7c:
-                case 0x7d:
-                case 0x7e:
-                case 0x7f:
+                case 0x7A:
+                case 0x7B:
+                case 0x7C:
+                case 0x7D:
+                case 0x7E:
+                case 0x7F:
                 {
                     add(current);
                     break;
                 }
 
                 // U+0080..U+07FF: bytes C2..DF 80..BF
-                case 0xc2:
-                case 0xc3:
-                case 0xc4:
-                case 0xc5:
-                case 0xc6:
-                case 0xc7:
-                case 0xc8:
-                case 0xc9:
-                case 0xca:
-                case 0xcb:
-                case 0xcc:
-                case 0xcd:
-                case 0xce:
-                case 0xcf:
-                case 0xd0:
-                case 0xd1:
-                case 0xd2:
-                case 0xd3:
-                case 0xd4:
-                case 0xd5:
-                case 0xd6:
-                case 0xd7:
-                case 0xd8:
-                case 0xd9:
-                case 0xda:
-                case 0xdb:
-                case 0xdc:
-                case 0xdd:
-                case 0xde:
-                case 0xdf:
+                case 0xC2:
+                case 0xC3:
+                case 0xC4:
+                case 0xC5:
+                case 0xC6:
+                case 0xC7:
+                case 0xC8:
+                case 0xC9:
+                case 0xCA:
+                case 0xCB:
+                case 0xCC:
+                case 0xCD:
+                case 0xCE:
+                case 0xCF:
+                case 0xD0:
+                case 0xD1:
+                case 0xD2:
+                case 0xD3:
+                case 0xD4:
+                case 0xD5:
+                case 0xD6:
+                case 0xD7:
+                case 0xD8:
+                case 0xD9:
+                case 0xDA:
+                case 0xDB:
+                case 0xDC:
+                case 0xDD:
+                case 0xDE:
+                case 0xDF:
                 {
                     if (JSON_UNLIKELY(not next_byte_in_range({0x80, 0xBF})))
                     {
@@ -2209,7 +2828,7 @@ class lexer
                 }
 
                 // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
-                case 0xe0:
+                case 0xE0:
                 {
                     if (JSON_UNLIKELY(not (next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
                     {
@@ -2220,20 +2839,20 @@ class lexer
 
                 // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
                 // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
-                case 0xe1:
-                case 0xe2:
-                case 0xe3:
-                case 0xe4:
-                case 0xe5:
-                case 0xe6:
-                case 0xe7:
-                case 0xe8:
-                case 0xe9:
-                case 0xea:
-                case 0xeb:
-                case 0xec:
-                case 0xee:
-                case 0xef:
+                case 0xE1:
+                case 0xE2:
+                case 0xE3:
+                case 0xE4:
+                case 0xE5:
+                case 0xE6:
+                case 0xE7:
+                case 0xE8:
+                case 0xE9:
+                case 0xEA:
+                case 0xEB:
+                case 0xEC:
+                case 0xEE:
+                case 0xEF:
                 {
                     if (JSON_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
                     {
@@ -2243,7 +2862,7 @@ class lexer
                 }
 
                 // U+D000..U+D7FF: bytes ED 80..9F 80..BF
-                case 0xed:
+                case 0xED:
                 {
                     if (JSON_UNLIKELY(not (next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
                     {
@@ -2253,7 +2872,7 @@ class lexer
                 }
 
                 // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
-                case 0xf0:
+                case 0xF0:
                 {
                     if (JSON_UNLIKELY(not (next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
                     {
@@ -2263,9 +2882,9 @@ class lexer
                 }
 
                 // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
-                case 0xf1:
-                case 0xf2:
-                case 0xf3:
+                case 0xF1:
+                case 0xF2:
+                case 0xF3:
                 {
                     if (JSON_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
                     {
@@ -2275,7 +2894,7 @@ class lexer
                 }
 
                 // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
-                case 0xf4:
+                case 0xF4:
                 {
                     if (JSON_UNLIKELY(not (next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
                     {
@@ -2337,7 +2956,7 @@ class lexer
     contains cycles, but any cycle can be left when EOF is read. Therefore,
     the function is guaranteed to terminate.
 
-    During scanning, the read bytes are stored in yytext. This string is
+    During scanning, the read bytes are stored in token_buffer. This string is
     then converted to a signed integer, an unsigned integer, or a
     floating-point number.
 
@@ -2351,7 +2970,7 @@ class lexer
     */
     token_type scan_number()
     {
-        // reset yytext to store the number's bytes
+        // reset token_buffer to store the number's bytes
         reset();
 
         // the type of the parsed number; initially set to unsigned; will be
@@ -2387,11 +3006,13 @@ class lexer
                 goto scan_number_any1;
             }
 
+            // LCOV_EXCL_START
             default:
             {
                 // all other characters are rejected outside scan_number()
-                assert(false); // LCOV_EXCL_LINE
+                assert(false);
             }
+                // LCOV_EXCL_STOP
         }
 
 scan_number_minus:
@@ -2627,12 +3248,7 @@ scan_number_any2:
 scan_number_done:
         // unget the character after the number (we only read it to know that
         // we are done scanning a number)
-        --chars_read;
-        next_unget = true;
-
-        // terminate token
-        add('\0');
-        --yylen;
+        unget();
 
         char* endptr = nullptr;
         errno = 0;
@@ -2640,10 +3256,10 @@ scan_number_done:
         // try to parse integers first and fall back to floats
         if (number_type == token_type::value_unsigned)
         {
-            const auto x = std::strtoull(yytext.data(), &endptr, 10);
+            const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
 
             // we checked the number format before
-            assert(endptr == yytext.data() + yylen);
+            assert(endptr == token_buffer.data() + token_buffer.size());
 
             if (errno == 0)
             {
@@ -2656,10 +3272,10 @@ scan_number_done:
         }
         else if (number_type == token_type::value_integer)
         {
-            const auto x = std::strtoll(yytext.data(), &endptr, 10);
+            const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
 
             // we checked the number format before
-            assert(endptr == yytext.data() + yylen);
+            assert(endptr == token_buffer.data() + token_buffer.size());
 
             if (errno == 0)
             {
@@ -2673,10 +3289,10 @@ scan_number_done:
 
         // this code is reached if we parse a floating-point number or if an
         // integer conversion above failed
-        strtof(value_float, yytext.data(), &endptr);
+        strtof(value_float, token_buffer.data(), &endptr);
 
         // we checked the number format before
-        assert(endptr == yytext.data() + yylen);
+        assert(endptr == token_buffer.data() + token_buffer.size());
 
         return token_type::value_float;
     }
@@ -2705,32 +3321,67 @@ scan_number_done:
     // input management
     /////////////////////
 
-    /// reset yytext
+    /// start a new token: clear token_buffer and seed token_string with current
     void reset() noexcept
     {
-        yylen = 0;
-        start_pos = chars_read - 1;
+        token_buffer.clear();
+        token_string.clear();
+        token_string.push_back(std::char_traits<char>::to_char_type(current));
     }
 
-    /// get a character from the input
-    int get()
+    /*!
+    @brief get next character from the input
+
+    This function provides the interface to the used input adapter. It does
+    not throw in case the input reached EOF, but returns
+    `std::char_traits<char>::eof()` in that case. Every non-EOF character is
+    also appended to token_string for use in error messages.
+
+    @return character read from the input (current again after an unget())
+    */
+    std::char_traits<char>::int_type get()
     {
         ++chars_read;
-        return next_unget ? (next_unget = false, current)
-               : (current = ia->get_character());
+        if (next_unget)
+        {
+            // just reset the next_unget variable and work with current
+            next_unget = false;
+        }
+        else
+        {
+            current = ia->get_character();
+        }
+
+        if (JSON_LIKELY(current != std::char_traits<char>::eof()))
+        {
+            token_string.push_back(std::char_traits<char>::to_char_type(current));
+        }
+        return current;
     }
 
-    /// add a character to yytext
+    /*!
+    @brief push back the current character (the next get() returns it again)
+
+    The input adapter is not touched: next_unget is set so that the next get()
+    reuses current, chars_read is decremented and - unless current is EOF - the
+    last character recorded in token_string is removed.
+    */
+    void unget()
+    {
+        --chars_read;
+        next_unget = true;
+        // get() does not record EOF in token_string, so only drop real characters
+        if (JSON_LIKELY(current != std::char_traits<char>::eof()))
+        {
+            assert(not token_string.empty());
+            token_string.pop_back();
+        }
+    }
+
+    /// append c, converted via std::char_traits<char>::to_char_type, to token_buffer
     void add(int c)
     {
-        // resize yytext if necessary; this condition is deemed unlikely,
-        // because we start with a 1024-byte buffer
-        if (JSON_UNLIKELY((yylen + 1 > yytext.capacity())))
-        {
-            yytext.resize(2 * yytext.capacity(), '\0');
-        }
-        assert(yylen < yytext.size());
-        yytext[yylen++] = static_cast<char>(c);
+        token_buffer.push_back(std::char_traits<char>::to_char_type(c));
     }
 
   public:
@@ -2756,12 +3407,10 @@ scan_number_done:
         return value_float;
     }
 
-    /// return string value
-    const std::string get_string()
+    /// return token_buffer by mutable reference (presumably so callers can move from it; then useful only once)
+    string_t& get_string()
     {
-        // yytext cannot be returned as char*, because it may contain a null
-        // byte (parsed as "\u0000")
-        return std::string(yytext.data(), yylen);
+        return token_buffer;
     }
 
     /////////////////////
@@ -2774,28 +3423,21 @@ scan_number_done:
         return chars_read;
     }
 
-    /// return the last read token (for errors only)
+    /// return the last read token (for errors only).  Will never contain EOF
+    /// (an arbitrary value that is not a valid char value, often -1), because
+    /// 255 may legitimately occur.  May contain NUL, which should be escaped.
     std::string get_token_string() const
     {
-        // get the raw byte sequence of the last token
-        std::string s = ia->read(start_pos, chars_read - start_pos);
-
         // escape control characters
         std::string result;
-        for (auto c : s)
+        for (const auto c : token_string)
         {
-	  if (c == '\0' or ((int)c) == std::char_traits<char>::eof())
-            {
-                // ignore EOF
-                continue;
-            }
-            else if ('\x00' <= c and c <= '\x1f')
+            if ('\x00' <= c and c <= '\x1F')
             {
                 // escape control characters
-                std::stringstream ss;
-                ss << "<U+" << std::setw(4) << std::uppercase << std::setfill('0')
-                   << std::hex << static_cast<int>(c) << ">";
-                result += ss.str();
+                char cs[9];
+                snprintf(cs, 9, "<U+%.4X>", static_cast<unsigned char>(c));
+                result += cs;
             }
             else
             {
@@ -2817,8 +3459,43 @@ scan_number_done:
     // actual scanner
     /////////////////////
 
+    /*!
+    @brief consume an optional UTF-8 byte order mark (0xEF 0xBB 0xBF)
+
+    If the first byte read is not 0xEF, it is handed back via unget() and
+    scanned normally afterwards. Once 0xEF has been read, the bytes 0xBB and
+    0xBF must follow; the third byte is only read if the second one matched.
+
+    @return true iff there is no BOM or the correct BOM has been skipped
+    */
+    bool skip_bom()
+    {
+        if (get() != 0xEF)
+        {
+            // not the beginning of a BOM; unget the byte to process it later
+            unget();
+            return true;
+        }
+
+        // 0xEF has been consumed, so anything but 0xBB 0xBF is an invalid BOM
+        if (get() != 0xBB)
+        {
+            return false;
+        }
+
+        // true if the BOM was parsed completely
+        return get() == 0xBF;
+    }
+
     token_type scan()
     {
+        // initially, skip the BOM
+        if (chars_read == 0 and not skip_bom())
+        {
+            error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
+            return token_type::parse_error;
+        }
+
         // read next character and ignore whitespace
         do
         {
@@ -2886,20 +3563,19 @@ scan_number_done:
     detail::input_adapter_t ia = nullptr;
 
     /// the current character
-    int current = std::char_traits<char>::eof();
+    std::char_traits<char>::int_type current = std::char_traits<char>::eof();
 
-    /// whether get() should return the last character again
+    /// whether the next get() call should just return current
     bool next_unget = false;
 
     /// the number of characters read
     std::size_t chars_read = 0;
-    /// the start position of the current token
-    std::size_t start_pos = 0;
+
+    /// raw input token string (for error messages)
+    std::vector<char> token_string {};
 
     /// buffer for variable-length tokens (numbers, strings)
-    std::vector<char> yytext = std::vector<char>(1024, '\0');
-    /// current index in yytext
-    std::size_t yylen = 0;
+    string_t token_buffer {};
 
     /// a description of occurred lexer errors
     const char* error_message = "";
@@ -2912,6 +3588,888 @@ scan_number_done:
     /// the decimal point
     const char decimal_point_char = '.';
 };
+}
+}
+
+// #include <nlohmann/detail/input/parser.hpp>
+
+
+#include <cassert> // assert
+#include <cmath> // isfinite
+#include <cstdint> // uint8_t
+#include <functional> // function
+#include <string> // string
+#include <utility> // move
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/is_sax.hpp>
+
+
+#include <cstdint> // size_t
+#include <utility> // declval
+
+// #include <nlohmann/detail/meta/detected.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+template <typename T>
+using null_function_t = decltype(std::declval<T&>().null());
+
+template <typename T>
+using boolean_function_t =
+    decltype(std::declval<T&>().boolean(std::declval<bool>()));
+
+template <typename T, typename Integer>
+using number_integer_function_t =
+    decltype(std::declval<T&>().number_integer(std::declval<Integer>()));
+
+template <typename T, typename Unsigned>
+using number_unsigned_function_t =
+    decltype(std::declval<T&>().number_unsigned(std::declval<Unsigned>()));
+
+template <typename T, typename Float, typename String>
+using number_float_function_t = decltype(std::declval<T&>().number_float(
+                                    std::declval<Float>(), std::declval<const String&>()));
+
+template <typename T, typename String>
+using string_function_t =
+    decltype(std::declval<T&>().string(std::declval<String&>()));
+
+template <typename T>
+using start_object_function_t =
+    decltype(std::declval<T&>().start_object(std::declval<std::size_t>()));
+
+template <typename T, typename String>
+using key_function_t =
+    decltype(std::declval<T&>().key(std::declval<String&>()));
+
+template <typename T>
+using end_object_function_t = decltype(std::declval<T&>().end_object());
+
+template <typename T>
+using start_array_function_t =
+    decltype(std::declval<T&>().start_array(std::declval<std::size_t>()));
+
+template <typename T>
+using end_array_function_t = decltype(std::declval<T&>().end_array());
+
+template <typename T, typename Exception>
+using parse_error_function_t = decltype(std::declval<T&>().parse_error(
+        std::declval<std::size_t>(), std::declval<const std::string&>(),
+        std::declval<const Exception&>()));
+
+template <typename SAX, typename BasicJsonType>
+struct is_sax
+{
+  private:
+    static_assert(is_basic_json<BasicJsonType>::value,
+                  "BasicJsonType must be of type basic_json<...>");
+
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using exception_t = typename BasicJsonType::exception;
+
+  public:
+    static constexpr bool value =
+        is_detected_exact<bool, null_function_t, SAX>::value &&
+        is_detected_exact<bool, boolean_function_t, SAX>::value &&
+        is_detected_exact<bool, number_integer_function_t, SAX,
+        number_integer_t>::value &&
+        is_detected_exact<bool, number_unsigned_function_t, SAX,
+        number_unsigned_t>::value &&
+        is_detected_exact<bool, number_float_function_t, SAX, number_float_t,
+        string_t>::value &&
+        is_detected_exact<bool, string_function_t, SAX, string_t>::value &&
+        is_detected_exact<bool, start_object_function_t, SAX>::value &&
+        is_detected_exact<bool, key_function_t, SAX, string_t>::value &&
+        is_detected_exact<bool, end_object_function_t, SAX>::value &&
+        is_detected_exact<bool, start_array_function_t, SAX>::value &&
+        is_detected_exact<bool, end_array_function_t, SAX>::value &&
+        is_detected_exact<bool, parse_error_function_t, SAX, exception_t>::value;
+};
+
+/// Compile-time diagnostics for a SAX type: one static_assert per required member
+/// function, so a non-conforming SAX class fails to compile with a readable message.
+template <typename SAX, typename BasicJsonType>
+struct is_sax_static_asserts
+{
+  private:
+    static_assert(is_basic_json<BasicJsonType>::value,
+                  "BasicJsonType must be of type basic_json<...>");
+
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using exception_t = typename BasicJsonType::exception;
+
+  public:
+    static_assert(is_detected_exact<bool, null_function_t, SAX>::value,
+                  "Missing/invalid function: bool null()");
+    static_assert(is_detected_exact<bool, boolean_function_t, SAX>::value,
+                  "Missing/invalid function: bool boolean(bool)");
+    static_assert(
+        is_detected_exact<bool, number_integer_function_t, SAX,
+        number_integer_t>::value,
+        "Missing/invalid function: bool number_integer(number_integer_t)");
+    static_assert(
+        is_detected_exact<bool, number_unsigned_function_t, SAX,
+        number_unsigned_t>::value,
+        "Missing/invalid function: bool number_unsigned(number_unsigned_t)");
+    static_assert(is_detected_exact<bool, number_float_function_t, SAX,
+                  number_float_t, string_t>::value,
+                  "Missing/invalid function: bool number_float(number_float_t, const string_t&)");
+    static_assert(
+        is_detected_exact<bool, string_function_t, SAX, string_t>::value,
+        "Missing/invalid function: bool string(string_t&)");
+    static_assert(is_detected_exact<bool, start_object_function_t, SAX>::value,
+                  "Missing/invalid function: bool start_object(std::size_t)");
+    static_assert(is_detected_exact<bool, key_function_t, SAX, string_t>::value,
+                  "Missing/invalid function: bool key(string_t&)");
+    static_assert(is_detected_exact<bool, end_object_function_t, SAX>::value,
+                  "Missing/invalid function: bool end_object()");
+    static_assert(is_detected_exact<bool, start_array_function_t, SAX>::value,
+                  "Missing/invalid function: bool start_array(std::size_t)");
+    static_assert(is_detected_exact<bool, end_array_function_t, SAX>::value,
+                  "Missing/invalid function: bool end_array()");
+    static_assert(
+        is_detected_exact<bool, parse_error_function_t, SAX, exception_t>::value,
+        "Missing/invalid function: bool parse_error(std::size_t, const "
+        "std::string&, const exception&)");
+};
+}
+}
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+// #include <nlohmann/detail/input/json_sax.hpp>
+
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+// #include <nlohmann/detail/input/parser.hpp>
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+
+namespace nlohmann
+{
+
+/*!
+@brief SAX interface
+
+This class describes the SAX interface used by @ref nlohmann::json::sax_parse.
+Each function is called in different situations while the input is parsed. The
+boolean return value informs the parser whether to continue processing the
+input.
+*/
+template<typename BasicJsonType>
+struct json_sax
+{
+    /// type for (signed) integers
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    /// type for unsigned integers
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    /// type for floating-point numbers
+    using number_float_t = typename BasicJsonType::number_float_t;
+    /// type for strings
+    using string_t = typename BasicJsonType::string_t;
+
+    /*!
+    @brief a null value was read
+    @return whether parsing should proceed
+    */
+    virtual bool null() = 0;
+
+    /*!
+    @brief a boolean value was read
+    @param[in] val  boolean value
+    @return whether parsing should proceed
+    */
+    virtual bool boolean(bool val) = 0;
+
+    /*!
+    @brief an integer number was read
+    @param[in] val  integer value
+    @return whether parsing should proceed
+    */
+    virtual bool number_integer(number_integer_t val) = 0;
+
+    /*!
+    @brief an unsigned integer number was read
+    @param[in] val  unsigned integer value
+    @return whether parsing should proceed
+    */
+    virtual bool number_unsigned(number_unsigned_t val) = 0;
+
+    /*!
+    @brief a floating-point number was read
+    @param[in] val  floating-point value
+    @param[in] s    raw token value
+    @return whether parsing should proceed
+    */
+    virtual bool number_float(number_float_t val, const string_t& s) = 0;
+
+    /*!
+    @brief a string was read
+    @param[in] val  string value
+    @return whether parsing should proceed
+    @note It is safe to move the passed string.
+    */
+    virtual bool string(string_t& val) = 0;
+
+    /*!
+    @brief the beginning of an object was read
+    @param[in] elements  number of object elements or -1 if unknown
+    @return whether parsing should proceed
+    @note binary formats may report the number of elements
+    */
+    virtual bool start_object(std::size_t elements) = 0;
+
+    /*!
+    @brief an object key was read
+    @param[in] val  object key
+    @return whether parsing should proceed
+    @note It is safe to move the passed string.
+    */
+    virtual bool key(string_t& val) = 0;
+
+    /*!
+    @brief the end of an object was read
+    @return whether parsing should proceed
+    */
+    virtual bool end_object() = 0;
+
+    /*!
+    @brief the beginning of an array was read
+    @param[in] elements  number of array elements or -1 if unknown
+    @return whether parsing should proceed
+    @note binary formats may report the number of elements
+    */
+    virtual bool start_array(std::size_t elements) = 0;
+
+    /*!
+    @brief the end of an array was read
+    @return whether parsing should proceed
+    */
+    virtual bool end_array() = 0;
+
+    /*!
+    @brief a parse error occurred
+    @param[in] position    the position in the input where the error occurs
+    @param[in] last_token  the last read token
+    @param[in] ex          exception object describing the error
+    @return whether parsing should proceed (must return false)
+    */
+    virtual bool parse_error(std::size_t position,
+                             const std::string& last_token,
+                             const detail::exception& ex) = 0;
+
+    virtual ~json_sax() = default;
+};
+
+
+namespace detail
+{
+/*!
+@brief SAX implementation to create a JSON value from SAX events
+
+This class implements the @ref json_sax interface and processes the SAX events
+to create a JSON value which makes it basically a DOM parser. The structure or
+hierarchy of the JSON value is managed by the stack `ref_stack` which contains
+a pointer to the respective array or object for each recursion depth.
+
+After successful parsing, the value that is passed by reference to the
+constructor contains the parsed value.
+
+@tparam BasicJsonType  the JSON type
+*/
+template<typename BasicJsonType>
+class json_sax_dom_parser
+{
+  public:
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+
+    /*!
+    @param[in, out] r  reference to a JSON value that is manipulated while
+                       parsing
+    @param[in] allow_exceptions_  whether parse errors yield exceptions
+    */
+    explicit json_sax_dom_parser(BasicJsonType& r, const bool allow_exceptions_ = true)
+        : root(r), allow_exceptions(allow_exceptions_)
+    {}
+
+    bool null()
+    {
+        handle_value(nullptr);
+        return true;
+    }
+
+    bool boolean(bool val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_integer(number_integer_t val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_unsigned(number_unsigned_t val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_float(number_float_t val, const string_t&)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool string(string_t& val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool start_object(std::size_t len)
+    {
+        ref_stack.push_back(handle_value(BasicJsonType::value_t::object));
+
+        if (JSON_UNLIKELY(len != std::size_t(-1) and len > ref_stack.back()->max_size()))
+        {
+            JSON_THROW(out_of_range::create(408,
+                                            "excessive object size: " + std::to_string(len)));
+        }
+
+        return true;
+    }
+
+    bool key(string_t& val)
+    {
+        // add null at given key and store the reference for later
+        object_element = &(ref_stack.back()->m_value.object->operator[](val));
+        return true;
+    }
+
+    bool end_object()
+    {
+        ref_stack.pop_back();
+        return true;
+    }
+
+    bool start_array(std::size_t len)
+    {
+        ref_stack.push_back(handle_value(BasicJsonType::value_t::array));
+
+        if (JSON_UNLIKELY(len != std::size_t(-1) and len > ref_stack.back()->max_size()))
+        {
+            JSON_THROW(out_of_range::create(408,
+                                            "excessive array size: " + std::to_string(len)));
+        }
+
+        return true;
+    }
+
+    bool end_array()
+    {
+        ref_stack.pop_back();
+        return true;
+    }
+
+    bool parse_error(std::size_t, const std::string&,
+                     const detail::exception& ex)
+    {
+        errored = true;
+        if (allow_exceptions)
+        {
+            // determine the proper exception type from the id
+            switch ((ex.id / 100) % 100)
+            {
+                case 1:
+                    JSON_THROW(*reinterpret_cast<const detail::parse_error*>(&ex));
+                case 4:
+                    JSON_THROW(*reinterpret_cast<const detail::out_of_range*>(&ex));
+                // LCOV_EXCL_START
+                case 2:
+                    JSON_THROW(*reinterpret_cast<const detail::invalid_iterator*>(&ex));
+                case 3:
+                    JSON_THROW(*reinterpret_cast<const detail::type_error*>(&ex));
+                case 5:
+                    JSON_THROW(*reinterpret_cast<const detail::other_error*>(&ex));
+                default:
+                    assert(false);
+                    // LCOV_EXCL_STOP
+            }
+        }
+        return false;
+    }
+
+    constexpr bool is_errored() const
+    {
+        return errored;
+    }
+
+  private:
+    /*!
+    @invariant If the ref stack is empty, then the passed value will be the new
+               root.
+    @invariant If the ref stack contains a value, then it is an array or an
+               object to which we can add elements
+    */
+    template<typename Value>
+    BasicJsonType* handle_value(Value&& v)
+    {
+        // no open container: the value becomes the new root
+        if (ref_stack.empty())
+        {
+            root = BasicJsonType(std::forward<Value>(v));
+            return &root;
+        }
+
+        BasicJsonType* const parent = ref_stack.back();
+        assert(parent->is_array() or parent->is_object());
+        // innermost container is an array: append and hand back the new element
+        if (parent->is_array())
+        {
+            parent->m_value.array->emplace_back(std::forward<Value>(v));
+            return &(parent->m_value.array->back());
+        }
+
+        // innermost container is an object: fill the slot stored in object_element
+        assert(object_element);
+        *object_element = BasicJsonType(std::forward<Value>(v));
+        return object_element;
+    }
+
+    /// the parsed JSON value
+    BasicJsonType& root;
+    /// stack to model hierarchy of values
+    std::vector<BasicJsonType*> ref_stack;
+    /// helper to hold the reference for the next object element
+    BasicJsonType* object_element = nullptr;
+    /// whether a syntax error occurred
+    bool errored = false;
+    /// whether to throw exceptions in case of errors
+    const bool allow_exceptions = true;
+};
+
+template<typename BasicJsonType>
+class json_sax_dom_callback_parser
+{
+  public:
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using parser_callback_t = typename BasicJsonType::parser_callback_t;
+    using parse_event_t = typename BasicJsonType::parse_event_t;
+
+    json_sax_dom_callback_parser(BasicJsonType& r,
+                                 const parser_callback_t cb,
+                                 const bool allow_exceptions_ = true)
+        : root(r), callback(cb), allow_exceptions(allow_exceptions_)
+    {
+        keep_stack.push_back(true);
+    }
+
+    bool null()
+    {
+        handle_value(nullptr);
+        return true;
+    }
+
+    bool boolean(bool val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_integer(number_integer_t val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_unsigned(number_unsigned_t val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_float(number_float_t val, const string_t&)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool string(string_t& val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool start_object(std::size_t len)
+    {
+        // check callback for object start
+        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::object_start, discarded);
+        keep_stack.push_back(keep);
+
+        auto val = handle_value(BasicJsonType::value_t::object, true);
+        ref_stack.push_back(val.second);
+
+        // check object limit
+        if (ref_stack.back())
+        {
+            if (JSON_UNLIKELY(len != std::size_t(-1) and len > ref_stack.back()->max_size()))
+            {
+                JSON_THROW(out_of_range::create(408,
+                                                "excessive object size: " + std::to_string(len)));
+            }
+        }
+
+        return true;
+    }
+
+    bool key(string_t& val)
+    {
+        BasicJsonType k = BasicJsonType(val);
+
+        // check callback for key
+        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::key, k);
+        key_keep_stack.push_back(keep);
+
+        // add discarded value at given key and store the reference for later
+        if (keep and ref_stack.back())
+        {
+            object_element = &(ref_stack.back()->m_value.object->operator[](val) = discarded);
+        }
+
+        return true;
+    }
+
+    bool end_object()
+    {
+        if (ref_stack.back())
+        {
+            if (not callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::object_end, *ref_stack.back()))
+            {
+                // discard object
+                *ref_stack.back() = discarded;
+            }
+        }
+
+        assert(not ref_stack.empty());
+        assert(not keep_stack.empty());
+        ref_stack.pop_back();
+        keep_stack.pop_back();
+
+        if (not ref_stack.empty() and ref_stack.back())
+        {
+            // remove discarded value
+            if (ref_stack.back()->is_object())
+            {
+                for (auto it = ref_stack.back()->begin(); it != ref_stack.back()->end(); ++it)
+                {
+                    if (it->is_discarded())
+                    {
+                        ref_stack.back()->erase(it);
+                        break;
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+    bool start_array(std::size_t len)
+    {
+        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::array_start, discarded);
+        keep_stack.push_back(keep);
+
+        auto val = handle_value(BasicJsonType::value_t::array, true);
+        ref_stack.push_back(val.second);
+
+        // check array limit
+        if (ref_stack.back())
+        {
+            if (JSON_UNLIKELY(len != std::size_t(-1) and len > ref_stack.back()->max_size()))
+            {
+                JSON_THROW(out_of_range::create(408,
+                                                "excessive array size: " + std::to_string(len)));
+            }
+        }
+
+        return true;
+    }
+
+    bool end_array()
+    {
+        bool keep = true;
+
+        if (ref_stack.back())
+        {
+            keep = callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::array_end, *ref_stack.back());
+            if (not keep)
+            {
+                // discard array
+                *ref_stack.back() = discarded;
+            }
+        }
+
+        assert(not ref_stack.empty());
+        assert(not keep_stack.empty());
+        ref_stack.pop_back();
+        keep_stack.pop_back();
+
+        // remove discarded value
+        if (not keep and not ref_stack.empty())
+        {
+            if (ref_stack.back()->is_array())
+            {
+                ref_stack.back()->m_value.array->pop_back();
+            }
+        }
+
+        return true;
+    }
+
+    bool parse_error(std::size_t, const std::string&,
+                     const detail::exception& ex)
+    {
+        errored = true;
+        if (allow_exceptions)
+        {
+            // determine the proper exception type from the id
+            switch ((ex.id / 100) % 100)
+            {
+                case 1:
+                    JSON_THROW(*reinterpret_cast<const detail::parse_error*>(&ex));
+                case 4:
+                    JSON_THROW(*reinterpret_cast<const detail::out_of_range*>(&ex));
+                // LCOV_EXCL_START
+                case 2:
+                    JSON_THROW(*reinterpret_cast<const detail::invalid_iterator*>(&ex));
+                case 3:
+                    JSON_THROW(*reinterpret_cast<const detail::type_error*>(&ex));
+                case 5:
+                    JSON_THROW(*reinterpret_cast<const detail::other_error*>(&ex));
+                default:
+                    assert(false);
+                    // LCOV_EXCL_STOP
+            }
+        }
+        return false;
+    }
+
+    constexpr bool is_errored() const
+    {
+        return errored;
+    }
+
+  private:
+    /*!
+    @param[in] v  value to add to the JSON value we build during parsing
+    @param[in] skip_callback  whether we should skip calling the callback
+               function; this is required after start_array() and
+               start_object() SAX events, because otherwise we would call the
+               callback function with an empty array or object, respectively.
+
+    @invariant If the ref stack is empty, then the passed value will be the new
+               root.
+    @invariant If the ref stack contains a value, then it is an array or an
+               object to which we can add elements
+
+    @return pair of boolean (whether value should be kept) and pointer (to the
+            passed value in the ref_stack hierarchy; nullptr if not kept)
+    */
+    template<typename Value>
+    std::pair<bool, BasicJsonType*> handle_value(Value&& v, const bool skip_callback = false)
+    {
+        assert(not keep_stack.empty());
+
+        // do not handle this value if we know it would be added to a discarded
+        // container
+        if (not keep_stack.back())
+        {
+            return {false, nullptr};
+        }
+
+        // create value
+        auto value = BasicJsonType(std::forward<Value>(v));
+
+        // check callback
+        const bool keep = skip_callback or callback(static_cast<int>(ref_stack.size()), parse_event_t::value, value);
+
+        // do not handle this value if we just learnt it shall be discarded
+        if (not keep)
+        {
+            return {false, nullptr};
+        }
+
+        if (ref_stack.empty())
+        {
+            root = std::move(value);
+            return {true, &root};
+        }
+        else
+        {
+            // skip this value if we already decided to skip the parent
+            // (https://github.com/nlohmann/json/issues/971#issuecomment-413678360)
+            if (not ref_stack.back())
+            {
+                return {false, nullptr};
+            }
+
+            assert(ref_stack.back()->is_array() or ref_stack.back()->is_object());
+            if (ref_stack.back()->is_array())
+            {
+                ref_stack.back()->m_value.array->push_back(std::move(value));
+                return {true, &(ref_stack.back()->m_value.array->back())};
+            }
+            else
+            {
+                // check if we should store an element for the current key
+                assert(not key_keep_stack.empty());
+                const bool store_element = key_keep_stack.back();
+                key_keep_stack.pop_back();
+
+                if (not store_element)
+                {
+                    return {false, nullptr};
+                }
+
+                assert(object_element);
+                *object_element = std::move(value);
+                return {true, object_element};
+            }
+        }
+    }
+
+    /// the parsed JSON value
+    BasicJsonType& root;
+    /// stack to model hierarchy of values
+    std::vector<BasicJsonType*> ref_stack;
+    /// stack to manage which values to keep
+    std::vector<bool> keep_stack;
+    /// stack to manage which object keys to keep
+    std::vector<bool> key_keep_stack;
+    /// helper to hold the reference for the next object element
+    BasicJsonType* object_element = nullptr;
+    /// whether a syntax error occurred
+    bool errored = false;
+    /// callback function
+    const parser_callback_t callback = nullptr;
+    /// whether to throw exceptions in case of errors
+    const bool allow_exceptions = true;
+    /// a discarded value for the callback
+    BasicJsonType discarded = BasicJsonType::value_t::discarded;
+};
+
+template<typename BasicJsonType>
+class json_sax_acceptor
+{
+  public:
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+
+    bool null()
+    {
+        return true;
+    }
+
+    bool boolean(bool)
+    {
+        return true;
+    }
+
+    bool number_integer(number_integer_t)
+    {
+        return true;
+    }
+
+    bool number_unsigned(number_unsigned_t)
+    {
+        return true;
+    }
+
+    bool number_float(number_float_t, const string_t&)
+    {
+        return true;
+    }
+
+    bool string(string_t&)
+    {
+        return true;
+    }
+
+    bool start_object(std::size_t = std::size_t(-1))
+    {
+        return true;
+    }
+
+    bool key(string_t&)
+    {
+        return true;
+    }
+
+    bool end_object()
+    {
+        return true;
+    }
+
+    bool start_array(std::size_t = std::size_t(-1))
+    {
+        return true;
+    }
+
+    bool end_array()
+    {
+        return true;
+    }
+
+    bool parse_error(std::size_t, const std::string&, const detail::exception&)
+    {
+        return false;
+    }
+};
+}
+
+}
+
+// #include <nlohmann/detail/input/lexer.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+////////////
+// parser //
+////////////
 
 /*!
 @brief syntax analysis
@@ -2924,6 +4482,7 @@ class parser
     using number_integer_t = typename BasicJsonType::number_integer_t;
     using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
     using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
     using lexer_t = lexer<BasicJsonType>;
     using token_type = typename lexer_t::token_type;
 
@@ -2948,11 +4507,14 @@ class parser
         std::function<bool(int depth, parse_event_t event, BasicJsonType& parsed)>;
 
     /// a parser reading from an input adapter
-    explicit parser(detail::input_adapter_t adapter,
+    explicit parser(detail::input_adapter_t&& adapter,
                     const parser_callback_t cb = nullptr,
                     const bool allow_exceptions_ = true)
-        : callback(cb), m_lexer(adapter), allow_exceptions(allow_exceptions_)
-    {}
+        : callback(cb), m_lexer(std::move(adapter)), allow_exceptions(allow_exceptions_)
+    {
+        // read first token
+        get_token();
+    }
 
     /*!
     @brief public parser interface
@@ -2966,31 +4528,54 @@ class parser
     */
     void parse(const bool strict, BasicJsonType& result)
     {
-        // read first token
-        get_token();
-
-        parse_internal(true, result);
-        result.assert_invariant();
-
-        // in strict mode, input must be completely read
-        if (strict)
+        if (callback)
         {
-            get_token();
-            expect(token_type::end_of_input);
+            json_sax_dom_callback_parser<BasicJsonType> sdp(result, callback, allow_exceptions);
+            sax_parse_internal(&sdp);
+            result.assert_invariant();
+
+            // in strict mode, input must be completely read
+            if (strict and (get_token() != token_type::end_of_input))
+            {
+                sdp.parse_error(m_lexer.get_position(),
+                                m_lexer.get_token_string(),
+                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input)));
+            }
+
+            // in case of an error, return discarded value
+            if (sdp.is_errored())
+            {
+                result = value_t::discarded;
+                return;
+            }
+
+            // set top-level value to null if it was discarded by the callback
+            // function
+            if (result.is_discarded())
+            {
+                result = nullptr;
+            }
         }
-
-        // in case of an error, return discarded value
-        if (errored)
+        else
         {
-            result = value_t::discarded;
-            return;
-        }
+            json_sax_dom_parser<BasicJsonType> sdp(result, allow_exceptions);
+            sax_parse_internal(&sdp);
+            result.assert_invariant();
 
-        // set top-level value to null if it was discarded by the callback
-        // function
-        if (result.is_discarded())
-        {
-            result = nullptr;
+            // in strict mode, input must be completely read
+            if (strict and (get_token() != token_type::end_of_input))
+            {
+                sdp.parse_error(m_lexer.get_position(),
+                                m_lexer.get_token_string(),
+                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input)));
+            }
+
+            // in case of an error, return discarded value
+            if (sdp.is_errored())
+            {
+                result = value_t::discarded;
+                return;
+            }
         }
     }
 
@@ -3002,397 +4587,311 @@ class parser
     */
     bool accept(const bool strict = true)
     {
-        // read first token
-        get_token();
+        json_sax_acceptor<BasicJsonType> sax_acceptor;
+        return sax_parse(&sax_acceptor, strict);
+    }
 
-        if (not accept_internal())
+    template <typename SAX>
+    bool sax_parse(SAX* sax, const bool strict = true)
+    {
+        (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
+        const bool result = sax_parse_internal(sax);
+
+        // strict mode: the next token must be end_of_input
+        if (result and strict and (get_token() != token_type::end_of_input))
         {
-            return false;
+            return sax->parse_error(m_lexer.get_position(),
+                                    m_lexer.get_token_string(),
+                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input)));
         }
 
-        // strict => last token must be EOF
-        return not strict or (get_token() == token_type::end_of_input);
+        return result;
     }
 
   private:
-    /*!
-    @brief the actual parser
-    @throw parse_error.101 in case of an unexpected token
-    @throw parse_error.102 if to_unicode fails or surrogate error
-    @throw parse_error.103 if to_unicode fails
-    */
-    void parse_internal(bool keep, BasicJsonType& result)
+    template <typename SAX>
+    bool sax_parse_internal(SAX* sax)
     {
-        // never parse after a parse error was detected
-        assert(not errored);
+        // stack to remember the hierarchy of structured values we are parsing
+        // true = array; false = object
+        std::vector<bool> states;
+        // value to avoid a goto (see comment where set to true)
+        bool skip_to_state_evaluation = false;
 
-        // start with a discarded value
-        if (not result.is_discarded())
+        while (true)
         {
-            result.m_value.destroy(result.m_type);
-            result.m_type = value_t::discarded;
-        }
-
-        switch (last_token)
-        {
-            case token_type::begin_object:
+            if (not skip_to_state_evaluation)
             {
-                if (keep and (not callback or ((keep = callback(depth++, parse_event_t::object_start, result)))))
+                // invariant: get_token() was called before each iteration
+                switch (last_token)
                 {
-                    // explicitly set result to object to cope with {}
-                    result.m_type = value_t::object;
-                    result.m_value = value_t::object;
-                }
-
-                // read next token
-                get_token();
-
-                // closing } -> we are done
-                if (last_token == token_type::end_object)
-                {
-                    if (keep and callback and not callback(--depth, parse_event_t::object_end, result))
+                    case token_type::begin_object:
                     {
-                        result.m_value.destroy(result.m_type);
-                        result.m_type = value_t::discarded;
-                    }
-                    break;
-                }
-
-                // parse values
-                std::string key;
-                BasicJsonType value;
-                while (true)
-                {
-                    // store key
-                    if (not expect(token_type::value_string))
-                    {
-                        return;
-                    }
-                    key = m_lexer.get_string();
-
-                    bool keep_tag = false;
-                    if (keep)
-                    {
-                        if (callback)
+                        if (JSON_UNLIKELY(not sax->start_object(std::size_t(-1))))
                         {
-                            BasicJsonType k(key);
-                            keep_tag = callback(depth, parse_event_t::key, k);
+                            return false;
+                        }
+
+                        // closing } -> we are done
+                        if (get_token() == token_type::end_object)
+                        {
+                            if (JSON_UNLIKELY(not sax->end_object()))
+                            {
+                                return false;
+                            }
+                            break;
+                        }
+
+                        // parse key
+                        if (JSON_UNLIKELY(last_token != token_type::value_string))
+                        {
+                            return sax->parse_error(m_lexer.get_position(),
+                                                    m_lexer.get_token_string(),
+                                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string)));
                         }
                         else
                         {
-                            keep_tag = true;
+                            if (JSON_UNLIKELY(not sax->key(m_lexer.get_string())))
+                            {
+                                return false;
+                            }
+                        }
+
+                        // parse separator (:)
+                        if (JSON_UNLIKELY(get_token() != token_type::name_separator))
+                        {
+                            return sax->parse_error(m_lexer.get_position(),
+                                                    m_lexer.get_token_string(),
+                                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator)));
+                        }
+
+                        // remember we are now inside an object
+                        states.push_back(false);
+
+                        // parse values
+                        get_token();
+                        continue;
+                    }
+
+                    case token_type::begin_array:
+                    {
+                        if (JSON_UNLIKELY(not sax->start_array(std::size_t(-1))))
+                        {
+                            return false;
+                        }
+
+                        // closing ] -> we are done
+                        if (get_token() == token_type::end_array)
+                        {
+                            if (JSON_UNLIKELY(not sax->end_array()))
+                            {
+                                return false;
+                            }
+                            break;
+                        }
+
+                        // remember we are now inside an array
+                        states.push_back(true);
+
+                        // parse values (no need to call get_token)
+                        continue;
+                    }
+
+                    case token_type::value_float:
+                    {
+                        const auto res = m_lexer.get_number_float();
+
+                        if (JSON_UNLIKELY(not std::isfinite(res)))
+                        {
+                            return sax->parse_error(m_lexer.get_position(),
+                                                    m_lexer.get_token_string(),
+                                                    out_of_range::create(406, "number overflow parsing '" + m_lexer.get_token_string() + "'"));
+                        }
+                        else
+                        {
+                            if (JSON_UNLIKELY(not sax->number_float(res, m_lexer.get_string())))
+                            {
+                                return false;
+                            }
+                            break;
                         }
                     }
 
-                    // parse separator (:)
-                    get_token();
-                    if (not expect(token_type::name_separator))
+                    case token_type::literal_false:
                     {
-                        return;
+                        if (JSON_UNLIKELY(not sax->boolean(false)))
+                        {
+                            return false;
+                        }
+                        break;
                     }
 
-                    // parse and add value
-                    get_token();
-                    value.m_value.destroy(value.m_type);
-                    value.m_type = value_t::discarded;
-                    parse_internal(keep, value);
-
-                    if (JSON_UNLIKELY(errored))
+                    case token_type::literal_null:
                     {
-                        return;
+                        if (JSON_UNLIKELY(not sax->null()))
+                        {
+                            return false;
+                        }
+                        break;
                     }
 
-                    if (keep and keep_tag and not value.is_discarded())
+                    case token_type::literal_true:
                     {
-                        result.m_value.object->emplace(std::move(key), std::move(value));
+                        if (JSON_UNLIKELY(not sax->boolean(true)))
+                        {
+                            return false;
+                        }
+                        break;
                     }
 
-                    // comma -> next value
-                    get_token();
-                    if (last_token == token_type::value_separator)
+                    case token_type::value_integer:
                     {
-                        get_token();
-                        continue;
+                        if (JSON_UNLIKELY(not sax->number_integer(m_lexer.get_number_integer())))
+                        {
+                            return false;
+                        }
+                        break;
                     }
 
-                    // closing }
-                    if (not expect(token_type::end_object))
+                    case token_type::value_string:
                     {
-                        return;
-                    }
-                    break;
-                }
-
-                if (keep and callback and not callback(--depth, parse_event_t::object_end, result))
-                {
-                    result.m_value.destroy(result.m_type);
-                    result.m_type = value_t::discarded;
-                }
-                break;
-            }
-
-            case token_type::begin_array:
-            {
-                if (keep and (not callback or ((keep = callback(depth++, parse_event_t::array_start, result)))))
-                {
-                    // explicitly set result to object to cope with []
-                    result.m_type = value_t::array;
-                    result.m_value = value_t::array;
-                }
-
-                // read next token
-                get_token();
-
-                // closing ] -> we are done
-                if (last_token == token_type::end_array)
-                {
-                    if (callback and not callback(--depth, parse_event_t::array_end, result))
-                    {
-                        result.m_value.destroy(result.m_type);
-                        result.m_type = value_t::discarded;
-                    }
-                    break;
-                }
-
-                // parse values
-                BasicJsonType value;
-                while (true)
-                {
-                    // parse value
-                    value.m_value.destroy(value.m_type);
-                    value.m_type = value_t::discarded;
-                    parse_internal(keep, value);
-
-                    if (JSON_UNLIKELY(errored))
-                    {
-                        return;
+                        if (JSON_UNLIKELY(not sax->string(m_lexer.get_string())))
+                        {
+                            return false;
+                        }
+                        break;
                     }
 
-                    if (keep and not value.is_discarded())
+                    case token_type::value_unsigned:
                     {
-                        result.m_value.array->push_back(std::move(value));
+                        if (JSON_UNLIKELY(not sax->number_unsigned(m_lexer.get_number_unsigned())))
+                        {
+                            return false;
+                        }
+                        break;
                     }
 
-                    // comma -> next value
-                    get_token();
-                    if (last_token == token_type::value_separator)
+                    case token_type::parse_error:
                     {
-                        get_token();
-                        continue;
+                        // using "uninitialized" to avoid "expected" message
+                        return sax->parse_error(m_lexer.get_position(),
+                                                m_lexer.get_token_string(),
+                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::uninitialized)));
                     }
 
-                    // closing ]
-                    if (not expect(token_type::end_array))
+                    default: // the last token was unexpected
                     {
-                        return;
+                        return sax->parse_error(m_lexer.get_position(),
+                                                m_lexer.get_token_string(),
+                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value)));
                     }
-                    break;
-                }
-
-                if (keep and callback and not callback(--depth, parse_event_t::array_end, result))
-                {
-                    result.m_value.destroy(result.m_type);
-                    result.m_type = value_t::discarded;
-                }
-                break;
-            }
-
-            case token_type::literal_null:
-            {
-                result.m_type = value_t::null;
-                break;
-            }
-
-            case token_type::value_string:
-            {
-                result.m_type = value_t::string;
-                result.m_value = m_lexer.get_string();
-                break;
-            }
-
-            case token_type::literal_true:
-            {
-                result.m_type = value_t::boolean;
-                result.m_value = true;
-                break;
-            }
-
-            case token_type::literal_false:
-            {
-                result.m_type = value_t::boolean;
-                result.m_value = false;
-                break;
-            }
-
-            case token_type::value_unsigned:
-            {
-                result.m_type = value_t::number_unsigned;
-                result.m_value = m_lexer.get_number_unsigned();
-                break;
-            }
-
-            case token_type::value_integer:
-            {
-                result.m_type = value_t::number_integer;
-                result.m_value = m_lexer.get_number_integer();
-                break;
-            }
-
-            case token_type::value_float:
-            {
-                result.m_type = value_t::number_float;
-                result.m_value = m_lexer.get_number_float();
-
-                // throw in case of infinity or NAN
-                if (JSON_UNLIKELY(not std::isfinite(result.m_value.number_float)))
-                {
-                    if (allow_exceptions)
-                    {
-                        JSON_THROW(out_of_range::create(406, "number overflow parsing '" +
-                                                        m_lexer.get_token_string() + "'"));
-                    }
-                    expect(token_type::uninitialized);
-                }
-                break;
-            }
-
-            case token_type::parse_error:
-            {
-                // using "uninitialized" to avoid "expected" message
-                if (not expect(token_type::uninitialized))
-                {
-                    return;
-                }
-                break; // LCOV_EXCL_LINE
-            }
-
-            default:
-            {
-                // the last token was unexpected; we expected a value
-                if (not expect(token_type::literal_or_value))
-                {
-                    return;
-                }
-                break; // LCOV_EXCL_LINE
-            }
-        }
-
-        if (keep and callback and not callback(depth, parse_event_t::value, result))
-        {
-            result.m_type = value_t::discarded;
-        }
-    }
-
-    /*!
-    @brief the acutal acceptor
-
-    @invariant 1. The last token is not yet processed. Therefore, the caller
-                  of this function must make sure a token has been read.
-               2. When this function returns, the last token is processed.
-                  That is, the last read character was already considered.
-
-    This invariant makes sure that no token needs to be "unput".
-    */
-    bool accept_internal()
-    {
-        switch (last_token)
-        {
-            case token_type::begin_object:
-            {
-                // read next token
-                get_token();
-
-                // closing } -> we are done
-                if (last_token == token_type::end_object)
-                {
-                    return true;
-                }
-
-                // parse values
-                while (true)
-                {
-                    // parse key
-                    if (last_token != token_type::value_string)
-                    {
-                        return false;
-                    }
-
-                    // parse separator (:)
-                    get_token();
-                    if (last_token != token_type::name_separator)
-                    {
-                        return false;
-                    }
-
-                    // parse value
-                    get_token();
-                    if (not accept_internal())
-                    {
-                        return false;
-                    }
-
-                    // comma -> next value
-                    get_token();
-                    if (last_token == token_type::value_separator)
-                    {
-                        get_token();
-                        continue;
-                    }
-
-                    // closing }
-                    return (last_token == token_type::end_object);
                 }
             }
-
-            case token_type::begin_array:
+            else
             {
-                // read next token
-                get_token();
-
-                // closing ] -> we are done
-                if (last_token == token_type::end_array)
-                {
-                    return true;
-                }
-
-                // parse values
-                while (true)
-                {
-                    // parse value
-                    if (not accept_internal())
-                    {
-                        return false;
-                    }
-
-                    // comma -> next value
-                    get_token();
-                    if (last_token == token_type::value_separator)
-                    {
-                        get_token();
-                        continue;
-                    }
-
-                    // closing ]
-                    return (last_token == token_type::end_array);
-                }
+                skip_to_state_evaluation = false;
             }
 
-            case token_type::value_float:
+            // we reached this line after we successfully parsed a value
+            if (states.empty())
             {
-                // reject infinity or NAN
-                return std::isfinite(m_lexer.get_number_float());
-            }
-
-            case token_type::literal_false:
-            case token_type::literal_null:
-            case token_type::literal_true:
-            case token_type::value_integer:
-            case token_type::value_string:
-            case token_type::value_unsigned:
+                // empty stack: we reached the end of the hierarchy: done
                 return true;
+            }
+            else
+            {
+                if (states.back())  // array
+                {
+                    // comma -> next value
+                    if (get_token() == token_type::value_separator)
+                    {
+                        // parse a new value
+                        get_token();
+                        continue;
+                    }
 
-            default: // the last token was unexpected
-                return false;
+                    // closing ]
+                    if (JSON_LIKELY(last_token == token_type::end_array))
+                    {
+                        if (JSON_UNLIKELY(not sax->end_array()))
+                        {
+                            return false;
+                        }
+
+                        // We are done with this array. Before we can parse a
+                        // new value, we need to evaluate the new state first.
+                        // By setting skip_to_state_evaluation to true, we
+                        // are effectively jumping to the beginning of this if.
+                        assert(not states.empty());
+                        states.pop_back();
+                        skip_to_state_evaluation = true;
+                        continue;
+                    }
+                    else
+                    {
+                        return sax->parse_error(m_lexer.get_position(),
+                                                m_lexer.get_token_string(),
+                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_array)));
+                    }
+                }
+                else  // object
+                {
+                    // comma -> next value
+                    if (get_token() == token_type::value_separator)
+                    {
+                        // parse key
+                        if (JSON_UNLIKELY(get_token() != token_type::value_string))
+                        {
+                            return sax->parse_error(m_lexer.get_position(),
+                                                    m_lexer.get_token_string(),
+                                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string)));
+                        }
+                        else
+                        {
+                            if (JSON_UNLIKELY(not sax->key(m_lexer.get_string())))
+                            {
+                                return false;
+                            }
+                        }
+
+                        // parse separator (:)
+                        if (JSON_UNLIKELY(get_token() != token_type::name_separator))
+                        {
+                            return sax->parse_error(m_lexer.get_position(),
+                                                    m_lexer.get_token_string(),
+                                                    parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator)));
+                        }
+
+                        // parse values
+                        get_token();
+                        continue;
+                    }
+
+                    // closing }
+                    if (JSON_LIKELY(last_token == token_type::end_object))
+                    {
+                        if (JSON_UNLIKELY(not sax->end_object()))
+                        {
+                            return false;
+                        }
+
+                        // We are done with this object. Before we can parse a
+                        // new value, we need to evaluate the new state first.
+                        // By setting skip_to_state_evaluation to true, we
+                        // are effectively jumping to the beginning of this if.
+                        assert(not states.empty());
+                        states.pop_back();
+                        skip_to_state_evaluation = true;
+                        continue;
+                    }
+                    else
+                    {
+                        return sax->parse_error(m_lexer.get_position(),
+                                                m_lexer.get_token_string(),
+                                                parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_object)));
+                    }
+                }
+            }
         }
     }
 
@@ -3402,29 +4901,7 @@ class parser
         return (last_token = m_lexer.scan());
     }
 
-    /*!
-    @throw parse_error.101 if expected token did not occur
-    */
-    bool expect(token_type t)
-    {
-        if (JSON_UNLIKELY(t != last_token))
-        {
-            errored = true;
-            expected = t;
-            if (allow_exceptions)
-            {
-                throw_exception();
-            }
-            else
-            {
-                return false;
-            }
-        }
-
-        return true;
-    }
-
-    [[noreturn]] void throw_exception() const
+    std::string exception_message(const token_type expected)
     {
         std::string error_msg = "syntax error - ";
         if (last_token == token_type::parse_error)
@@ -3442,31 +4919,33 @@ class parser
             error_msg += "; expected " + std::string(lexer_t::token_type_name(expected));
         }
 
-        JSON_THROW(parse_error::create(101, m_lexer.get_position(), error_msg));
+        return error_msg;
     }
 
   private:
-    /// current level of recursion
-    int depth = 0;
     /// callback function
     const parser_callback_t callback = nullptr;
     /// the type of the last read token
     token_type last_token = token_type::uninitialized;
     /// the lexer
     lexer_t m_lexer;
-    /// whether a syntax error occurred
-    bool errored = false;
-    /// possible reason for the syntax error
-    token_type expected = token_type::uninitialized;
     /// whether to throw exceptions in case of errors
     const bool allow_exceptions = true;
 };
+}
+}
 
-///////////////
-// iterators //
-///////////////
+// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
 
-/*!
+
+#include <cstddef> // ptrdiff_t
+#include <limits>  // numeric_limits
+
+namespace nlohmann
+{
+namespace detail
+{
+/*
 @brief an iterator for primitive JSON types
 
 This class models an iterator for primitive JSON types (boolean, number,
@@ -3477,9 +4956,15 @@ end_value (`1`) models past the end.
 */
 class primitive_iterator_t
 {
-  public:
+  private:
     using difference_type = std::ptrdiff_t;
+    static constexpr difference_type begin_value = 0;
+    static constexpr difference_type end_value = begin_value + 1;
 
+    /// iterator as signed integer type
+    difference_type m_it = (std::numeric_limits<std::ptrdiff_t>::min)();
+
+  public:
     constexpr difference_type get_value() const noexcept
     {
         return m_it;
@@ -3500,23 +4985,18 @@ class primitive_iterator_t
     /// return whether the iterator can be dereferenced
     constexpr bool is_begin() const noexcept
     {
-        return (m_it == begin_value);
+        return m_it == begin_value;
     }
 
     /// return whether the iterator is at end
     constexpr bool is_end() const noexcept
     {
-        return (m_it == end_value);
+        return m_it == end_value;
     }
 
     friend constexpr bool operator==(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
     {
-        return (lhs.m_it == rhs.m_it);
-    }
-
-    friend constexpr bool operator!=(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
-    {
-        return not(lhs == rhs);
+        return lhs.m_it == rhs.m_it;
     }
 
     friend constexpr bool operator<(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
@@ -3524,25 +5004,10 @@ class primitive_iterator_t
         return lhs.m_it < rhs.m_it;
     }
 
-    friend constexpr bool operator<=(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
-    {
-        return lhs.m_it <= rhs.m_it;
-    }
-
-    friend constexpr bool operator>(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
-    {
-        return lhs.m_it > rhs.m_it;
-    }
-
-    friend constexpr bool operator>=(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
-    {
-        return lhs.m_it >= rhs.m_it;
-    }
-
-    primitive_iterator_t operator+(difference_type i)
+    primitive_iterator_t operator+(difference_type n) noexcept
     {
         auto result = *this;
-        result += i;
+        result += n;
         return result;
     }
 
@@ -3551,57 +5016,57 @@ class primitive_iterator_t
         return lhs.m_it - rhs.m_it;
     }
 
-    friend std::ostream& operator<<(std::ostream& os, primitive_iterator_t it)
-    {
-        return os << it.m_it;
-    }
-
-    primitive_iterator_t& operator++()
+    primitive_iterator_t& operator++() noexcept
     {
         ++m_it;
         return *this;
     }
 
-    primitive_iterator_t operator++(int)
+    primitive_iterator_t const operator++(int) noexcept
     {
         auto result = *this;
-        m_it++;
+        ++m_it;
         return result;
     }
 
-    primitive_iterator_t& operator--()
+    primitive_iterator_t& operator--() noexcept
     {
         --m_it;
         return *this;
     }
 
-    primitive_iterator_t operator--(int)
+    primitive_iterator_t const operator--(int) noexcept
     {
         auto result = *this;
-        m_it--;
+        --m_it;
         return result;
     }
 
-    primitive_iterator_t& operator+=(difference_type n)
+    primitive_iterator_t& operator+=(difference_type n) noexcept
     {
         m_it += n;
         return *this;
     }
 
-    primitive_iterator_t& operator-=(difference_type n)
+    primitive_iterator_t& operator-=(difference_type n) noexcept
     {
         m_it -= n;
         return *this;
     }
-
-  private:
-    static constexpr difference_type begin_value = 0;
-    static constexpr difference_type end_value = begin_value + 1;
-
-    /// iterator as signed integer type
-    difference_type m_it = std::numeric_limits<std::ptrdiff_t>::denorm_min();
 };
+}
+}
 
+// #include <nlohmann/detail/iterators/internal_iterator.hpp>
+
+
+// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
 /*!
 @brief an iterator value
 
@@ -3617,11 +5082,38 @@ template<typename BasicJsonType> struct internal_iterator
     /// generic iterator for all other types
     primitive_iterator_t primitive_iterator {};
 };
+}
+}
 
+// #include <nlohmann/detail/iterators/iter_impl.hpp>
+
+
+#include <ciso646> // not
+#include <iterator> // iterator, random_access_iterator_tag, bidirectional_iterator_tag, advance, next
+#include <type_traits> // conditional, is_const, remove_const
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/iterators/internal_iterator.hpp>
+
+// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+// forward declare, to be able to friend it later on
 template<typename IteratorType> class iteration_proxy;
 
 /*!
-@brief a template for a random access iterator for the @ref basic_json class
+@brief a template for a bidirectional iterator for the @ref basic_json class
 
 This class implements a both iterators (iterator and const_iterator) for the
 @ref basic_json class.
@@ -3633,18 +5125,19 @@ This class implements a both iterators (iterator and const_iterator) for the
 
 @requirement The class satisfies the following concept requirements:
 -
-[RandomAccessIterator](http://en.cppreference.com/w/cpp/concept/RandomAccessIterator):
-  The iterator that can be moved to point (forward and backward) to any
-  element in constant time.
+[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
+  The iterator can be moved in both directions (i.e.
+  incremented and decremented).
 
-@since version 1.0.0, simplified in version 2.0.9
+@since version 1.0.0, simplified in version 2.0.9, changed to bidirectional
+       iterators in version 3.0.0 (see https://github.com/nlohmann/json/issues/593)
 */
 template<typename BasicJsonType>
-class iter_impl : public std::iterator<std::random_access_iterator_tag, BasicJsonType>
+class iter_impl
 {
     /// allow basic_json to access private members
     friend iter_impl<typename std::conditional<std::is_const<BasicJsonType>::value, typename std::remove_const<BasicJsonType>::type, const BasicJsonType>::type>;
-  //    friend class BasicJsonType;
+    friend BasicJsonType;
     friend iteration_proxy<iter_impl>;
 
     using object_t = typename BasicJsonType::object_t;
@@ -3654,6 +5147,14 @@ class iter_impl : public std::iterator<std::random_access_iterator_tag, BasicJso
                   "iter_impl only accepts (const) basic_json");
 
   public:
+
+    /// The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17.
+    /// The C++ Standard has never required user-defined iterators to derive from std::iterator.
+    /// A user-defined iterator should provide publicly accessible typedefs named
+    /// iterator_category, value_type, difference_type, pointer, and reference.
+    /// Note that value_type is required to be non-const, even for constant iterators.
+    using iterator_category = std::bidirectional_iterator_tag;
+
     /// the type of the values when the iterator is dereferenced
     using value_type = typename BasicJsonType::value_type;
     /// a type to represent differences between iterators
@@ -3667,8 +5168,6 @@ class iter_impl : public std::iterator<std::random_access_iterator_tag, BasicJso
         typename std::conditional<std::is_const<BasicJsonType>::value,
         typename BasicJsonType::const_reference,
         typename BasicJsonType::reference>::type;
-    /// the category of the iterator
-    using iterator_category = std::bidirectional_iterator_tag;
 
     /// default constructor
     iter_impl() = default;
@@ -3735,7 +5234,7 @@ class iter_impl : public std::iterator<std::random_access_iterator_tag, BasicJso
         return *this;
     }
 
-  public:
+  private:
     /*!
     @brief set the iterator to the first value
     @pre The iterator is initialized; i.e. `m_object != nullptr`.
@@ -3879,7 +5378,7 @@ class iter_impl : public std::iterator<std::random_access_iterator_tag, BasicJso
     @brief post-increment (it++)
     @pre The iterator is initialized; i.e. `m_object != nullptr`.
     */
-    iter_impl operator++(int)
+    iter_impl const operator++(int)
     {
         auto result = *this;
         ++(*this);
@@ -3922,7 +5421,7 @@ class iter_impl : public std::iterator<std::random_access_iterator_tag, BasicJso
     @brief post-decrement (it--)
     @pre The iterator is initialized; i.e. `m_object != nullptr`.
     */
-    iter_impl operator--(int)
+    iter_impl const operator--(int)
     {
         auto result = *this;
         --(*this);
@@ -4178,7 +5677,7 @@ class iter_impl : public std::iterator<std::random_access_iterator_tag, BasicJso
     @brief  return the key of an object iterator
     @pre The iterator is initialized; i.e. `m_object != nullptr`.
     */
-    typename object_t::key_type key() const
+    const typename object_t::key_type& key() const
     {
         assert(m_object != nullptr);
 
@@ -4203,94 +5702,27 @@ class iter_impl : public std::iterator<std::random_access_iterator_tag, BasicJso
     /// associated JSON instance
     pointer m_object = nullptr;
     /// the actual iterator of the associated instance
-    internal_iterator<typename std::remove_const<BasicJsonType>::type> m_it = {};
+    internal_iterator<typename std::remove_const<BasicJsonType>::type> m_it;
 };
+}
+}
 
-/// proxy class for the iterator_wrapper functions
-template<typename IteratorType> class iteration_proxy
+// #include <nlohmann/detail/iterators/iteration_proxy.hpp>
+
+// #include <nlohmann/detail/iterators/json_reverse_iterator.hpp>
+
+
+#include <cstddef> // ptrdiff_t
+#include <iterator> // reverse_iterator
+#include <utility> // declval
+
+namespace nlohmann
 {
-  private:
-    /// helper class for iteration
-    class iteration_proxy_internal
-    {
-      private:
-        /// the iterator
-        IteratorType anchor;
-        /// an index for arrays (used to create key names)
-        std::size_t array_index = 0;
-
-      public:
-        explicit iteration_proxy_internal(IteratorType it) noexcept : anchor(it) {}
-
-        /// dereference operator (needed for range-based for)
-        iteration_proxy_internal& operator*()
-        {
-            return *this;
-        }
-
-        /// increment operator (needed for range-based for)
-        iteration_proxy_internal& operator++()
-        {
-            ++anchor;
-            ++array_index;
-
-            return *this;
-        }
-
-        /// inequality operator (needed for range-based for)
-        bool operator!=(const iteration_proxy_internal& o) const noexcept
-        {
-            return anchor != o.anchor;
-        }
-
-        /// return key of the iterator
-        std::string key() const
-        {
-            assert(anchor.m_object != nullptr);
-
-            switch (anchor.m_object->type())
-            {
-                // use integer array index as key
-                case value_t::array:
-                    return std::to_string(array_index);
-
-                // use key from the object
-                case value_t::object:
-                    return anchor.key();
-
-                // use an empty key for all primitive types
-                default:
-                    return "";
-            }
-        }
-
-        /// return value of the iterator
-        typename IteratorType::reference value() const
-        {
-            return anchor.value();
-        }
-    };
-
-    /// the container to iterate
-    typename IteratorType::reference container;
-
-  public:
-    /// construct iteration proxy from a container
-    explicit iteration_proxy(typename IteratorType::reference cont)
-        : container(cont) {}
-
-    /// return iterator begin (needed for range-based for)
-    iteration_proxy_internal begin() noexcept
-    {
-        return iteration_proxy_internal(container.begin());
-    }
-
-    /// return iterator end (needed for range-based for)
-    iteration_proxy_internal end() noexcept
-    {
-        return iteration_proxy_internal(container.end());
-    }
-};
+namespace detail
+{
+//////////////////////
+// reverse_iterator //
+//////////////////////
 
 /*!
 @brief a template for a reverse iterator class
@@ -4301,10 +5733,10 @@ create @ref const_reverse_iterator).
 
 @requirement The class satisfies the following concept requirements:
 -
-[RandomAccessIterator](http://en.cppreference.com/w/cpp/concept/RandomAccessIterator):
-  The iterator that can be moved to point (forward and backward) to any
-  element in constant time.
-- [OutputIterator](http://en.cppreference.com/w/cpp/concept/OutputIterator):
+[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
+  The iterator can be moved in both directions (i.e.
+  incremented and decremented).
+- [OutputIterator](https://en.cppreference.com/w/cpp/named_req/OutputIterator):
   It is possible to write to the pointed-to element (only if @a Base is
   @ref iterator).
 
@@ -4315,20 +5747,20 @@ class json_reverse_iterator : public std::reverse_iterator<Base>
 {
   public:
     using difference_type = std::ptrdiff_t;
-    /// shortcut to the reverse iterator adaptor
+    /// shortcut to the reverse iterator adapter
     using base_iterator = std::reverse_iterator<Base>;
     /// the reference type for the pointed-to element
     using reference = typename Base::reference;
 
     /// create reverse iterator from iterator
-    json_reverse_iterator(const typename base_iterator::iterator_type& it) noexcept
+    explicit json_reverse_iterator(const typename base_iterator::iterator_type& it) noexcept
         : base_iterator(it) {}
 
     /// create reverse iterator from base class
-    json_reverse_iterator(const base_iterator& it) noexcept : base_iterator(it) {}
+    explicit json_reverse_iterator(const base_iterator& it) noexcept : base_iterator(it) {}
 
     /// post-increment (it++)
-    json_reverse_iterator operator++(int)
+    json_reverse_iterator const operator++(int)
     {
         return static_cast<json_reverse_iterator>(base_iterator::operator++(1));
     }
@@ -4340,7 +5772,7 @@ class json_reverse_iterator : public std::reverse_iterator<Base>
     }
 
     /// post-decrement (it--)
-    json_reverse_iterator operator--(int)
+    json_reverse_iterator const operator--(int)
     {
         return static_cast<json_reverse_iterator>(base_iterator::operator--(1));
     }
@@ -4395,11 +5827,25 @@ class json_reverse_iterator : public std::reverse_iterator<Base>
         return it.operator * ();
     }
 };
+}
+}
 
-/////////////////////
-// output adapters //
-/////////////////////
+// #include <nlohmann/detail/output/output_adapters.hpp>
 
+
+#include <algorithm> // copy
+#include <cstddef> // size_t
+#include <ios> // streamsize
+#include <iterator> // back_inserter
+#include <memory> // shared_ptr, make_shared
+#include <ostream> // basic_ostream
+#include <string> // basic_string
+#include <vector> // vector
+
+namespace nlohmann
+{
+namespace detail
+{
 /// abstract output adapter interface
 template<typename CharType> struct output_adapter_protocol
 {
@@ -4455,11 +5901,11 @@ class output_stream_adapter : public output_adapter_protocol<CharType>
 };
 
 /// output adapter for basic_string
-template<typename CharType>
+template<typename CharType, typename StringType = std::basic_string<CharType>>
 class output_string_adapter : public output_adapter_protocol<CharType>
 {
   public:
-    explicit output_string_adapter(std::basic_string<CharType>& s) : str(s) {}
+    explicit output_string_adapter(StringType& s) : str(s) {}
 
     void write_character(CharType c) override
     {
@@ -4472,10 +5918,10 @@ class output_string_adapter : public output_adapter_protocol<CharType>
     }
 
   private:
-    std::basic_string<CharType>& str;
+    StringType& str;
 };
 
-template<typename CharType>
+template<typename CharType, typename StringType = std::basic_string<CharType>>
 class output_adapter
 {
   public:
@@ -4485,8 +5931,8 @@ class output_adapter
     output_adapter(std::basic_ostream<CharType>& s)
         : oa(std::make_shared<output_stream_adapter<CharType>>(s)) {}
 
-    output_adapter(std::basic_string<CharType>& s)
-        : oa(std::make_shared<output_string_adapter<CharType>>(s)) {}
+    output_adapter(StringType& s)
+        : oa(std::make_shared<output_string_adapter<CharType, StringType>>(s)) {}
 
     operator output_adapter_t<CharType>()
     {
@@ -4496,19 +5942,57 @@ class output_adapter
   private:
     output_adapter_t<CharType> oa = nullptr;
 };
+}
+}
 
-//////////////////////////////
-// binary reader and writer //
-//////////////////////////////
+// #include <nlohmann/detail/input/binary_reader.hpp>
+
+
+#include <algorithm> // generate_n
+#include <array> // array
+#include <cassert> // assert
+#include <cmath> // ldexp
+#include <cstddef> // size_t
+#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
+#include <cstdio> // snprintf
+#include <cstring> // memcpy
+#include <iterator> // back_inserter
+#include <limits> // numeric_limits
+#include <string> // char_traits, string
+#include <utility> // make_pair, move
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+// #include <nlohmann/detail/input/json_sax.hpp>
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/is_sax.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////////////
+// binary reader //
+///////////////////
 
 /*!
-@brief deserialization of CBOR and MessagePack values
+@brief deserialization of CBOR, MessagePack, and UBJSON values
 */
-template<typename BasicJsonType>
+template<typename BasicJsonType, typename SAX = json_sax_dom_parser<BasicJsonType>>
 class binary_reader
 {
     using number_integer_t = typename BasicJsonType::number_integer_t;
     using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using json_sax_t = SAX;
 
   public:
     /*!
@@ -4518,49 +6002,63 @@ class binary_reader
     */
     explicit binary_reader(input_adapter_t adapter) : ia(std::move(adapter))
     {
+        (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
         assert(ia);
     }
 
     /*!
-    @brief create a JSON value from CBOR input
-
+    @param[in] format  the binary format to parse
+    @param[in] sax_    a SAX event processor
     @param[in] strict  whether to expect the input to be consumed completed
-    @return JSON value created from CBOR input
 
-    @throw parse_error.110 if input ended unexpectedly or the end of file was
-                           not reached when @a strict was set to true
-    @throw parse_error.112 if unsupported byte was read
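+    @return the result of the format-specific parser; in strict mode with trailing input, the result of sax->parse_error()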
     */
-    BasicJsonType parse_cbor(const bool strict)
+    bool sax_parse(const input_format_t format,
+                   json_sax_t* sax_,
+                   const bool strict = true)
     {
-        const auto res = parse_cbor_internal();
-        if (strict)
+        sax = sax_;
+        bool result = false;
+
+        switch (format)
         {
-            get();
-            check_eof(true);
+            case input_format_t::cbor:
+                result = parse_cbor_internal();
+                break;
+
+            case input_format_t::msgpack:
+                result = parse_msgpack_internal();
+                break;
+
+            case input_format_t::ubjson:
+                result = parse_ubjson_internal();
+                break;
+
+            // LCOV_EXCL_START
+            default:
+                assert(false);
+                // LCOV_EXCL_STOP
         }
-        return res;
-    }
 
-    /*!
-    @brief create a JSON value from MessagePack input
-
-    @param[in] strict  whether to expect the input to be consumed completed
-    @return JSON value created from MessagePack input
-
-    @throw parse_error.110 if input ended unexpectedly or the end of file was
-                           not reached when @a strict was set to true
-    @throw parse_error.112 if unsupported byte was read
-    */
-    BasicJsonType parse_msgpack(const bool strict)
-    {
-        const auto res = parse_msgpack_internal();
-        if (strict)
+        // strict mode: next byte must be EOF
+        if (result and strict)
         {
-            get();
-            check_eof(true);
+            if (format == input_format_t::ubjson)
+            {
+                get_ignore_noop();
+            }
+            else
+            {
+                get();
+            }
+
+            if (JSON_UNLIKELY(current != std::char_traits<char>::eof()))
+            {
+                return sax->parse_error(chars_read, get_token_string(), parse_error::create(110, chars_read, "expected end of input"));
+            }
         }
-        return res;
+
+        return result;
     }
 
     /*!
@@ -4580,14 +6078,16 @@ class binary_reader
     @param[in] get_char  whether a new character should be retrieved from the
                          input (true, default) or whether the last read
                          character should be considered instead
+
+    @return whether a valid CBOR value was passed to the SAX parser
     */
-    BasicJsonType parse_cbor_internal(const bool get_char = true)
+    bool parse_cbor_internal(const bool get_char = true)
     {
         switch (get_char ? get() : current)
         {
             // EOF
             case std::char_traits<char>::eof():
-                JSON_THROW(parse_error::create(110, chars_read, "unexpected end of input"));
+                return unexpect_eof();
 
             // Integer 0x00..0x17 (0..23)
             case 0x00:
@@ -4600,12 +6100,12 @@ class binary_reader
             case 0x07:
             case 0x08:
             case 0x09:
-            case 0x0a:
-            case 0x0b:
-            case 0x0c:
-            case 0x0d:
-            case 0x0e:
-            case 0x0f:
+            case 0x0A:
+            case 0x0B:
+            case 0x0C:
+            case 0x0D:
+            case 0x0E:
+            case 0x0F:
             case 0x10:
             case 0x11:
             case 0x12:
@@ -4614,19 +6114,31 @@ class binary_reader
             case 0x15:
             case 0x16:
             case 0x17:
-                return static_cast<number_unsigned_t>(current);
+                return sax->number_unsigned(static_cast<number_unsigned_t>(current));
 
             case 0x18: // Unsigned integer (one-byte uint8_t follows)
-                return get_number<uint8_t>();
+            {
+                uint8_t number;
+                return get_number(number) and sax->number_unsigned(number);
+            }
 
             case 0x19: // Unsigned integer (two-byte uint16_t follows)
-                return get_number<uint16_t>();
+            {
+                uint16_t number;
+                return get_number(number) and sax->number_unsigned(number);
+            }
 
-            case 0x1a: // Unsigned integer (four-byte uint32_t follows)
-                return get_number<uint32_t>();
+            case 0x1A: // Unsigned integer (four-byte uint32_t follows)
+            {
+                uint32_t number;
+                return get_number(number) and sax->number_unsigned(number);
+            }
 
-            case 0x1b: // Unsigned integer (eight-byte uint64_t follows)
-                return get_number<uint64_t>();
+            case 0x1B: // Unsigned integer (eight-byte uint64_t follows)
+            {
+                uint64_t number;
+                return get_number(number) and sax->number_unsigned(number);
+            }
 
             // Negative integer -1-0x00..-1-0x17 (-1..-24)
             case 0x20:
@@ -4639,12 +6151,12 @@ class binary_reader
             case 0x27:
             case 0x28:
             case 0x29:
-            case 0x2a:
-            case 0x2b:
-            case 0x2c:
-            case 0x2d:
-            case 0x2e:
-            case 0x2f:
+            case 0x2A:
+            case 0x2B:
+            case 0x2C:
+            case 0x2D:
+            case 0x2E:
+            case 0x2F:
             case 0x30:
             case 0x31:
             case 0x32:
@@ -4653,28 +6165,31 @@ class binary_reader
             case 0x35:
             case 0x36:
             case 0x37:
-                return static_cast<int8_t>(0x20 - 1 - current);
+                return sax->number_integer(static_cast<int8_t>(0x20 - 1 - current));
 
             case 0x38: // Negative integer (one-byte uint8_t follows)
             {
-                // must be uint8_t !
-                return static_cast<number_integer_t>(-1) - get_number<uint8_t>();
+                uint8_t number;
+                return get_number(number) and sax->number_integer(static_cast<number_integer_t>(-1) - number);
             }
 
             case 0x39: // Negative integer -1-n (two-byte uint16_t follows)
             {
-                return static_cast<number_integer_t>(-1) - get_number<uint16_t>();
+                uint16_t number;
+                return get_number(number) and sax->number_integer(static_cast<number_integer_t>(-1) - number);
             }
 
-            case 0x3a: // Negative integer -1-n (four-byte uint32_t follows)
+            case 0x3A: // Negative integer -1-n (four-byte uint32_t follows)
             {
-                return static_cast<number_integer_t>(-1) - get_number<uint32_t>();
+                uint32_t number;
+                return get_number(number) and sax->number_integer(static_cast<number_integer_t>(-1) - number);
             }
 
-            case 0x3b: // Negative integer -1-n (eight-byte uint64_t follows)
+            case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows)
             {
-                return static_cast<number_integer_t>(-1) -
-                       static_cast<number_integer_t>(get_number<uint64_t>());
+                uint64_t number;
+                return get_number(number) and sax->number_integer(static_cast<number_integer_t>(-1)
+                        - static_cast<number_integer_t>(number));
             }
 
             // UTF-8 string (0x00..0x17 bytes follow)
@@ -4688,12 +6203,12 @@ class binary_reader
             case 0x67:
             case 0x68:
             case 0x69:
-            case 0x6a:
-            case 0x6b:
-            case 0x6c:
-            case 0x6d:
-            case 0x6e:
-            case 0x6f:
+            case 0x6A:
+            case 0x6B:
+            case 0x6C:
+            case 0x6D:
+            case 0x6E:
+            case 0x6F:
             case 0x70:
             case 0x71:
             case 0x72:
@@ -4704,11 +6219,12 @@ class binary_reader
             case 0x77:
             case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
             case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
-            case 0x7a: // UTF-8 string (four-byte uint32_t for n follow)
-            case 0x7b: // UTF-8 string (eight-byte uint64_t for n follow)
-            case 0x7f: // UTF-8 string (indefinite length)
+            case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
+            case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
+            case 0x7F: // UTF-8 string (indefinite length)
             {
-                return get_cbor_string();
+                string_t s;
+                return get_cbor_string(s) and sax->string(s);
             }
 
             // array (0x00..0x17 data items follow)
@@ -4722,12 +6238,12 @@ class binary_reader
             case 0x87:
             case 0x88:
             case 0x89:
-            case 0x8a:
-            case 0x8b:
-            case 0x8c:
-            case 0x8d:
-            case 0x8e:
-            case 0x8f:
+            case 0x8A:
+            case 0x8B:
+            case 0x8C:
+            case 0x8D:
+            case 0x8E:
+            case 0x8F:
             case 0x90:
             case 0x91:
             case 0x92:
@@ -4736,121 +6252,110 @@ class binary_reader
             case 0x95:
             case 0x96:
             case 0x97:
-            {
-                return get_cbor_array(current & 0x1f);
-            }
+                return get_cbor_array(static_cast<std::size_t>(current & 0x1F));
 
             case 0x98: // array (one-byte uint8_t for n follows)
             {
-                return get_cbor_array(get_number<uint8_t>());
+                uint8_t len;
+                return get_number(len) and get_cbor_array(static_cast<std::size_t>(len));
             }
 
             case 0x99: // array (two-byte uint16_t for n follow)
             {
-                return get_cbor_array(get_number<uint16_t>());
+                uint16_t len;
+                return get_number(len) and get_cbor_array(static_cast<std::size_t>(len));
             }
 
-            case 0x9a: // array (four-byte uint32_t for n follow)
+            case 0x9A: // array (four-byte uint32_t for n follow)
             {
-                return get_cbor_array(get_number<uint32_t>());
+                uint32_t len;
+                return get_number(len) and get_cbor_array(static_cast<std::size_t>(len));
             }
 
-            case 0x9b: // array (eight-byte uint64_t for n follow)
+            case 0x9B: // array (eight-byte uint64_t for n follow)
             {
-                return get_cbor_array(get_number<uint64_t>());
+                uint64_t len;
+                return get_number(len) and get_cbor_array(static_cast<std::size_t>(len));
             }
 
-            case 0x9f: // array (indefinite length)
-            {
-                BasicJsonType result = value_t::array;
-                while (get() != 0xff)
-                {
-                    result.push_back(parse_cbor_internal(false));
-                }
-                return result;
-            }
+            case 0x9F: // array (indefinite length)
+                return get_cbor_array(std::size_t(-1));
 
             // map (0x00..0x17 pairs of data items follow)
-            case 0xa0:
-            case 0xa1:
-            case 0xa2:
-            case 0xa3:
-            case 0xa4:
-            case 0xa5:
-            case 0xa6:
-            case 0xa7:
-            case 0xa8:
-            case 0xa9:
-            case 0xaa:
-            case 0xab:
-            case 0xac:
-            case 0xad:
-            case 0xae:
-            case 0xaf:
-            case 0xb0:
-            case 0xb1:
-            case 0xb2:
-            case 0xb3:
-            case 0xb4:
-            case 0xb5:
-            case 0xb6:
-            case 0xb7:
+            case 0xA0:
+            case 0xA1:
+            case 0xA2:
+            case 0xA3:
+            case 0xA4:
+            case 0xA5:
+            case 0xA6:
+            case 0xA7:
+            case 0xA8:
+            case 0xA9:
+            case 0xAA:
+            case 0xAB:
+            case 0xAC:
+            case 0xAD:
+            case 0xAE:
+            case 0xAF:
+            case 0xB0:
+            case 0xB1:
+            case 0xB2:
+            case 0xB3:
+            case 0xB4:
+            case 0xB5:
+            case 0xB6:
+            case 0xB7:
+                return get_cbor_object(static_cast<std::size_t>(current & 0x1F));
+
+            case 0xB8: // map (one-byte uint8_t for n follows)
             {
-                return get_cbor_object(current & 0x1f);
+                uint8_t len;
+                return get_number(len) and get_cbor_object(static_cast<std::size_t>(len));
             }
 
-            case 0xb8: // map (one-byte uint8_t for n follows)
+            case 0xB9: // map (two-byte uint16_t for n follow)
             {
-                return get_cbor_object(get_number<uint8_t>());
+                uint16_t len;
+                return get_number(len) and get_cbor_object(static_cast<std::size_t>(len));
             }
 
-            case 0xb9: // map (two-byte uint16_t for n follow)
+            case 0xBA: // map (four-byte uint32_t for n follow)
             {
-                return get_cbor_object(get_number<uint16_t>());
+                uint32_t len;
+                return get_number(len) and get_cbor_object(static_cast<std::size_t>(len));
             }
 
-            case 0xba: // map (four-byte uint32_t for n follow)
+            case 0xBB: // map (eight-byte uint64_t for n follow)
             {
-                return get_cbor_object(get_number<uint32_t>());
+                uint64_t len;
+                return get_number(len) and get_cbor_object(static_cast<std::size_t>(len));
             }
 
-            case 0xbb: // map (eight-byte uint64_t for n follow)
-            {
-                return get_cbor_object(get_number<uint64_t>());
-            }
+            case 0xBF: // map (indefinite length)
+                return get_cbor_object(std::size_t(-1));
 
-            case 0xbf: // map (indefinite length)
-            {
-                BasicJsonType result = value_t::object;
-                while (get() != 0xff)
-                {
-                    auto key = get_cbor_string();
-                    result[key] = parse_cbor_internal();
-                }
-                return result;
-            }
+            case 0xF4: // false
+                return sax->boolean(false);
 
-            case 0xf4: // false
-            {
-                return false;
-            }
+            case 0xF5: // true
+                return sax->boolean(true);
 
-            case 0xf5: // true
-            {
-                return true;
-            }
+            case 0xF6: // null
+                return sax->null();
 
-            case 0xf6: // null
-            {
-                return value_t::null;
-            }
-
-            case 0xf9: // Half-Precision Float (two-byte IEEE 754)
+            case 0xF9: // Half-Precision Float (two-byte IEEE 754)
             {
                 const int byte1 = get();
-                check_eof();
+                if (JSON_UNLIKELY(not unexpect_eof()))
+                {
+                    return false;
+                }
                 const int byte2 = get();
-                check_eof();
+                if (JSON_UNLIKELY(not unexpect_eof()))
+                {
+                    return false;
+                }
 
                 // code from RFC 7049, Appendix D, Figure 3:
                 // As half-precision floating-point numbers were only added
@@ -4861,51 +6366,59 @@ class binary_reader
                 // half-precision floating-point numbers in the C language
                 // is shown in Fig. 3.
                 const int half = (byte1 << 8) + byte2;
-                const int exp = (half >> 10) & 0x1f;
-                const int mant = half & 0x3ff;
-                double val;
-                if (exp == 0)
+                const double val = [&half]
                 {
-                    val = std::ldexp(mant, -24);
-                }
-                else if (exp != 31)
-                {
-                    val = std::ldexp(mant + 1024, exp - 25);
-                }
-                else
-                {
-                    val = (mant == 0) ? std::numeric_limits<double>::infinity()
-                          : std::numeric_limits<double>::quiet_NaN();
-                }
-                return (half & 0x8000) != 0 ? -val : val;
+                    const int exp = (half >> 10) & 0x1F;
+                    const int mant = half & 0x3FF;
+                    assert(0 <= exp and exp <= 32);
+                    assert(0 <= mant and mant <= 1024);
+                    switch (exp)
+                    {
+                        case 0:
+                            return std::ldexp(mant, -24);
+                        case 31:
+                            return (mant == 0)
+                            ? std::numeric_limits<double>::infinity()
+                            : std::numeric_limits<double>::quiet_NaN();
+                        default:
+                            return std::ldexp(mant + 1024, exp - 25);
+                    }
+                }();
+                return sax->number_float((half & 0x8000) != 0
+                                         ? static_cast<number_float_t>(-val)
+                                         : static_cast<number_float_t>(val), "");
             }
 
-            case 0xfa: // Single-Precision Float (four-byte IEEE 754)
+            case 0xFA: // Single-Precision Float (four-byte IEEE 754)
             {
-                return get_number<float>();
+                float number;
+                return get_number(number) and sax->number_float(static_cast<number_float_t>(number), "");
             }
 
-            case 0xfb: // Double-Precision Float (eight-byte IEEE 754)
+            case 0xFB: // Double-Precision Float (eight-byte IEEE 754)
             {
-                return get_number<double>();
+                double number;
+                return get_number(number) and sax->number_float(static_cast<number_float_t>(number), "");
             }
 
             default: // anything else (0xFF is handled inside the other types)
             {
-                std::stringstream ss;
-                ss << std::setw(2) << std::setfill('0') << std::hex << current;
-                JSON_THROW(parse_error::create(112, chars_read, "error reading CBOR; last byte: 0x" + ss.str()));
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, "error reading CBOR; last byte: 0x" + last_token));
             }
         }
     }
 
-    BasicJsonType parse_msgpack_internal()
+    /*!
+    @return whether a valid MessagePack value was passed to the SAX parser
+    */
+    bool parse_msgpack_internal()
     {
         switch (get())
         {
             // EOF
             case std::char_traits<char>::eof():
-                JSON_THROW(parse_error::create(110, chars_read, "unexpected end of input"));
+                return unexpect_eof();
 
             // positive fixint
             case 0x00:
@@ -4918,12 +6431,12 @@ class binary_reader
             case 0x07:
             case 0x08:
             case 0x09:
-            case 0x0a:
-            case 0x0b:
-            case 0x0c:
-            case 0x0d:
-            case 0x0e:
-            case 0x0f:
+            case 0x0A:
+            case 0x0B:
+            case 0x0C:
+            case 0x0D:
+            case 0x0E:
+            case 0x0F:
             case 0x10:
             case 0x11:
             case 0x12:
@@ -4934,12 +6447,12 @@ class binary_reader
             case 0x17:
             case 0x18:
             case 0x19:
-            case 0x1a:
-            case 0x1b:
-            case 0x1c:
-            case 0x1d:
-            case 0x1e:
-            case 0x1f:
+            case 0x1A:
+            case 0x1B:
+            case 0x1C:
+            case 0x1D:
+            case 0x1E:
+            case 0x1F:
             case 0x20:
             case 0x21:
             case 0x22:
@@ -4950,12 +6463,12 @@ class binary_reader
             case 0x27:
             case 0x28:
             case 0x29:
-            case 0x2a:
-            case 0x2b:
-            case 0x2c:
-            case 0x2d:
-            case 0x2e:
-            case 0x2f:
+            case 0x2A:
+            case 0x2B:
+            case 0x2C:
+            case 0x2D:
+            case 0x2E:
+            case 0x2F:
             case 0x30:
             case 0x31:
             case 0x32:
@@ -4966,12 +6479,12 @@ class binary_reader
             case 0x37:
             case 0x38:
             case 0x39:
-            case 0x3a:
-            case 0x3b:
-            case 0x3c:
-            case 0x3d:
-            case 0x3e:
-            case 0x3f:
+            case 0x3A:
+            case 0x3B:
+            case 0x3C:
+            case 0x3D:
+            case 0x3E:
+            case 0x3F:
             case 0x40:
             case 0x41:
             case 0x42:
@@ -4982,12 +6495,12 @@ class binary_reader
             case 0x47:
             case 0x48:
             case 0x49:
-            case 0x4a:
-            case 0x4b:
-            case 0x4c:
-            case 0x4d:
-            case 0x4e:
-            case 0x4f:
+            case 0x4A:
+            case 0x4B:
+            case 0x4C:
+            case 0x4D:
+            case 0x4E:
+            case 0x4F:
             case 0x50:
             case 0x51:
             case 0x52:
@@ -4998,12 +6511,12 @@ class binary_reader
             case 0x57:
             case 0x58:
             case 0x59:
-            case 0x5a:
-            case 0x5b:
-            case 0x5c:
-            case 0x5d:
-            case 0x5e:
-            case 0x5f:
+            case 0x5A:
+            case 0x5B:
+            case 0x5C:
+            case 0x5D:
+            case 0x5E:
+            case 0x5F:
             case 0x60:
             case 0x61:
             case 0x62:
@@ -5014,12 +6527,12 @@ class binary_reader
             case 0x67:
             case 0x68:
             case 0x69:
-            case 0x6a:
-            case 0x6b:
-            case 0x6c:
-            case 0x6d:
-            case 0x6e:
-            case 0x6f:
+            case 0x6A:
+            case 0x6B:
+            case 0x6C:
+            case 0x6D:
+            case 0x6E:
+            case 0x6F:
             case 0x70:
             case 0x71:
             case 0x72:
@@ -5030,13 +6543,13 @@ class binary_reader
             case 0x77:
             case 0x78:
             case 0x79:
-            case 0x7a:
-            case 0x7b:
-            case 0x7c:
-            case 0x7d:
-            case 0x7e:
-            case 0x7f:
-                return static_cast<number_unsigned_t>(current);
+            case 0x7A:
+            case 0x7B:
+            case 0x7C:
+            case 0x7D:
+            case 0x7E:
+            case 0x7F:
+                return sax->number_unsigned(static_cast<number_unsigned_t>(current));
 
             // fixmap
             case 0x80:
@@ -5049,15 +6562,13 @@ class binary_reader
             case 0x87:
             case 0x88:
             case 0x89:
-            case 0x8a:
-            case 0x8b:
-            case 0x8c:
-            case 0x8d:
-            case 0x8e:
-            case 0x8f:
-            {
-                return get_msgpack_object(current & 0x0f);
-            }
+            case 0x8A:
+            case 0x8B:
+            case 0x8C:
+            case 0x8D:
+            case 0x8E:
+            case 0x8F:
+                return get_msgpack_object(static_cast<std::size_t>(current & 0x0F));
 
             // fixarray
             case 0x90:
@@ -5070,165 +6581,213 @@ class binary_reader
             case 0x97:
             case 0x98:
             case 0x99:
-            case 0x9a:
-            case 0x9b:
-            case 0x9c:
-            case 0x9d:
-            case 0x9e:
-            case 0x9f:
-            {
-                return get_msgpack_array(current & 0x0f);
-            }
+            case 0x9A:
+            case 0x9B:
+            case 0x9C:
+            case 0x9D:
+            case 0x9E:
+            case 0x9F:
+                return get_msgpack_array(static_cast<std::size_t>(current & 0x0F));
 
             // fixstr
-            case 0xa0:
-            case 0xa1:
-            case 0xa2:
-            case 0xa3:
-            case 0xa4:
-            case 0xa5:
-            case 0xa6:
-            case 0xa7:
-            case 0xa8:
-            case 0xa9:
-            case 0xaa:
-            case 0xab:
-            case 0xac:
-            case 0xad:
-            case 0xae:
-            case 0xaf:
-            case 0xb0:
-            case 0xb1:
-            case 0xb2:
-            case 0xb3:
-            case 0xb4:
-            case 0xb5:
-            case 0xb6:
-            case 0xb7:
-            case 0xb8:
-            case 0xb9:
-            case 0xba:
-            case 0xbb:
-            case 0xbc:
-            case 0xbd:
-            case 0xbe:
-            case 0xbf:
-                return get_msgpack_string();
-
-            case 0xc0: // nil
-                return value_t::null;
-
-            case 0xc2: // false
-                return false;
-
-            case 0xc3: // true
-                return true;
-
-            case 0xca: // float 32
-                return get_number<float>();
-
-            case 0xcb: // float 64
-                return get_number<double>();
-
-            case 0xcc: // uint 8
-                return get_number<uint8_t>();
-
-            case 0xcd: // uint 16
-                return get_number<uint16_t>();
-
-            case 0xce: // uint 32
-                return get_number<uint32_t>();
-
-            case 0xcf: // uint 64
-                return get_number<uint64_t>();
-
-            case 0xd0: // int 8
-                return get_number<int8_t>();
-
-            case 0xd1: // int 16
-                return get_number<int16_t>();
-
-            case 0xd2: // int 32
-                return get_number<int32_t>();
-
-            case 0xd3: // int 64
-                return get_number<int64_t>();
-
-            case 0xd9: // str 8
-            case 0xda: // str 16
-            case 0xdb: // str 32
-                return get_msgpack_string();
-
-            case 0xdc: // array 16
+            case 0xA0:
+            case 0xA1:
+            case 0xA2:
+            case 0xA3:
+            case 0xA4:
+            case 0xA5:
+            case 0xA6:
+            case 0xA7:
+            case 0xA8:
+            case 0xA9:
+            case 0xAA:
+            case 0xAB:
+            case 0xAC:
+            case 0xAD:
+            case 0xAE:
+            case 0xAF:
+            case 0xB0:
+            case 0xB1:
+            case 0xB2:
+            case 0xB3:
+            case 0xB4:
+            case 0xB5:
+            case 0xB6:
+            case 0xB7:
+            case 0xB8:
+            case 0xB9:
+            case 0xBA:
+            case 0xBB:
+            case 0xBC:
+            case 0xBD:
+            case 0xBE:
+            case 0xBF:
             {
-                return get_msgpack_array(get_number<uint16_t>());
+                string_t s;
+                return get_msgpack_string(s) and sax->string(s);
             }
 
-            case 0xdd: // array 32
+            case 0xC0: // nil
+                return sax->null();
+
+            case 0xC2: // false
+                return sax->boolean(false);
+
+            case 0xC3: // true
+                return sax->boolean(true);
+
+            case 0xCA: // float 32
             {
-                return get_msgpack_array(get_number<uint32_t>());
+                float number;
+                return get_number(number) and sax->number_float(static_cast<number_float_t>(number), "");
             }
 
-            case 0xde: // map 16
+            case 0xCB: // float 64
             {
-                return get_msgpack_object(get_number<uint16_t>());
+                double number;
+                return get_number(number) and sax->number_float(static_cast<number_float_t>(number), "");
             }
 
-            case 0xdf: // map 32
+            case 0xCC: // uint 8
             {
-                return get_msgpack_object(get_number<uint32_t>());
+                uint8_t number;
+                return get_number(number) and sax->number_unsigned(number);
             }
 
-            // positive fixint
-            case 0xe0:
-            case 0xe1:
-            case 0xe2:
-            case 0xe3:
-            case 0xe4:
-            case 0xe5:
-            case 0xe6:
-            case 0xe7:
-            case 0xe8:
-            case 0xe9:
-            case 0xea:
-            case 0xeb:
-            case 0xec:
-            case 0xed:
-            case 0xee:
-            case 0xef:
-            case 0xf0:
-            case 0xf1:
-            case 0xf2:
-            case 0xf3:
-            case 0xf4:
-            case 0xf5:
-            case 0xf6:
-            case 0xf7:
-            case 0xf8:
-            case 0xf9:
-            case 0xfa:
-            case 0xfb:
-            case 0xfc:
-            case 0xfd:
-            case 0xfe:
-            case 0xff:
-                return static_cast<int8_t>(current);
+            case 0xCD: // uint 16
+            {
+                uint16_t number;
+                return get_number(number) and sax->number_unsigned(number);
+            }
+
+            case 0xCE: // uint 32
+            {
+                uint32_t number;
+                return get_number(number) and sax->number_unsigned(number);
+            }
+
+            case 0xCF: // uint 64
+            {
+                uint64_t number;
+                return get_number(number) and sax->number_unsigned(number);
+            }
+
+            case 0xD0: // int 8
+            {
+                int8_t number;
+                return get_number(number) and sax->number_integer(number);
+            }
+
+            case 0xD1: // int 16
+            {
+                int16_t number;
+                return get_number(number) and sax->number_integer(number);
+            }
+
+            case 0xD2: // int 32
+            {
+                int32_t number;
+                return get_number(number) and sax->number_integer(number);
+            }
+
+            case 0xD3: // int 64
+            {
+                int64_t number;
+                return get_number(number) and sax->number_integer(number);
+            }
+
+            case 0xD9: // str 8
+            case 0xDA: // str 16
+            case 0xDB: // str 32
+            {
+                string_t s;
+                return get_msgpack_string(s) and sax->string(s);
+            }
+
+            case 0xDC: // array 16
+            {
+                uint16_t len;
+                return get_number(len) and get_msgpack_array(static_cast<std::size_t>(len));
+            }
+
+            case 0xDD: // array 32
+            {
+                uint32_t len;
+                return get_number(len) and get_msgpack_array(static_cast<std::size_t>(len));
+            }
+
+            case 0xDE: // map 16
+            {
+                uint16_t len;
+                return get_number(len) and get_msgpack_object(static_cast<std::size_t>(len));
+            }
+
+            case 0xDF: // map 32
+            {
+                uint32_t len;
+                return get_number(len) and get_msgpack_object(static_cast<std::size_t>(len));
+            }
+
+            // negative fixint
+            case 0xE0:
+            case 0xE1:
+            case 0xE2:
+            case 0xE3:
+            case 0xE4:
+            case 0xE5:
+            case 0xE6:
+            case 0xE7:
+            case 0xE8:
+            case 0xE9:
+            case 0xEA:
+            case 0xEB:
+            case 0xEC:
+            case 0xED:
+            case 0xEE:
+            case 0xEF:
+            case 0xF0:
+            case 0xF1:
+            case 0xF2:
+            case 0xF3:
+            case 0xF4:
+            case 0xF5:
+            case 0xF6:
+            case 0xF7:
+            case 0xF8:
+            case 0xF9:
+            case 0xFA:
+            case 0xFB:
+            case 0xFC:
+            case 0xFD:
+            case 0xFE:
+            case 0xFF:
+                return sax->number_integer(static_cast<int8_t>(current));
 
             default: // anything else
             {
-                std::stringstream ss;
-                ss << std::setw(2) << std::setfill('0') << std::hex << current;
-                JSON_THROW(parse_error::create(112, chars_read,
-                                               "error reading MessagePack; last byte: 0x" + ss.str()));
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, "error reading MessagePack; last byte: 0x" + last_token));
             }
         }
     }
 
+    /*!
+    @param[in] get_char  true (default): fetch the next byte from the input,
+                         skipping 'N' bytes; false: reuse the last read byte
+
+    @return whether a valid UBJSON value was passed to the SAX parser
+    */
+    bool parse_ubjson_internal(const bool get_char = true)
+    {
+        const auto prefix = get_char ? get_ignore_noop() : current;
+        return get_ubjson_value(prefix);
+    }
+
     /*!
     @brief get next character from the input
 
     This function provides the interface to the used input adapter. It does
-    not throw in case the input reached EOF, but returns
+    not throw in case the input reached EOF, but returns the negative value
     `std::char_traits<char>::eof()` in that case.
 
     @return character read from the input
@@ -5239,27 +6798,44 @@ class binary_reader
         return (current = ia->get_character());
     }
 
+    /*!
+    @return first character read from the input that is not an 'N' entry
+    */
+    int get_ignore_noop()
+    {
+        // get() stores each byte in `current`; keep reading while it is 'N'
+        for (get(); current == 'N'; get())
+        {
+            // nothing to do for a skipped 'N' entry
+        }
+
+        return current;
+    }
+
     /*
     @brief read a number from the input
 
     @tparam NumberType the type of the number
+    @param[out] result  number of type @a NumberType
 
-    @return number of type @a NumberType
+    @return whether conversion completed
 
     @note This function needs to respect the system's endianess, because
-          bytes in CBOR and MessagePack are stored in network order (big
-          endian) and therefore need reordering on little endian systems.
-
-    @throw parse_error.110 if input has less than `sizeof(NumberType)` bytes
+          bytes in CBOR, MessagePack, and UBJSON are stored in network order
+          (big endian) and therefore need reordering on little endian systems.
     */
-    template<typename NumberType> NumberType get_number()
+    template<typename NumberType>
+    bool get_number(NumberType& result)
     {
         // step 1: read input into array with system's byte order
         std::array<uint8_t, sizeof(NumberType)> vec;
         for (std::size_t i = 0; i < sizeof(NumberType); ++i)
         {
             get();
-            check_eof();
+            if (JSON_UNLIKELY(not unexpect_eof()))
+            {
+                return false;
+            }
 
             // reverse byte order prior to conversion if necessary
             if (is_little_endian)
@@ -5273,35 +6849,37 @@ class binary_reader
         }
 
         // step 2: convert array into number of type T and return
-        NumberType result;
         std::memcpy(&result, vec.data(), sizeof(NumberType));
-        return result;
+        return true;
     }
 
     /*!
     @brief create a string by reading characters from the input
 
-    @param[in] len number of bytes to read
+    @tparam NumberType the integer type of @a len
+    @param[in] len number of characters to read
+    @param[out] result string created by reading @a len bytes
+
+    @return whether string creation completed
 
     @note We can not reserve @a len bytes for the result, because @a len
-          may be too large. Usually, @ref check_eof() detects the end of
-          the input before we run out of string memory.
-
-    @return string created by reading @a len bytes
-
-    @throw parse_error.110 if input has less than @a len bytes
+          may be too large. Reading stops at the first failing
+          @ref unexpect_eof(), so a bogus @a len cannot outgrow the input.
     */
     template<typename NumberType>
-    std::string get_string(const NumberType len)
+    bool get_string(const NumberType len, string_t& result)
     {
-        std::string result;
-        std::generate_n(std::back_inserter(result), len, [this]()
+        // plain loop instead of std::generate_n so that EOF aborts at once
+        for (NumberType i = 0; i < len; ++i)
         {
             get();
-            check_eof();
-            return current;
-        });
-        return result;
+            if (JSON_UNLIKELY(not unexpect_eof()))
+            {
+                return false;
+            }
+            result.push_back(static_cast<char>(current));
+        }
+        return true;
     }
 
     /*!
@@ -5311,14 +6889,16 @@ class binary_reader
     string length and then copies this number of bytes into a string.
     Additionally, CBOR's strings with indefinite lengths are supported.
 
-    @return string
+    @param[out] result  created string
 
-    @throw parse_error.110 if input ended
-    @throw parse_error.113 if an unexpected byte is read
+    @return whether string creation completed
     */
-    std::string get_cbor_string()
+    bool get_cbor_string(string_t& result)
     {
-        check_eof();
+        if (JSON_UNLIKELY(not unexpect_eof()))
+        {
+            return false;
+        }
 
         switch (current)
         {
@@ -5333,12 +6913,12 @@ class binary_reader
             case 0x67:
             case 0x68:
             case 0x69:
-            case 0x6a:
-            case 0x6b:
-            case 0x6c:
-            case 0x6d:
-            case 0x6e:
-            case 0x6f:
+            case 0x6A:
+            case 0x6B:
+            case 0x6C:
+            case 0x6D:
+            case 0x6E:
+            case 0x6F:
             case 0x70:
             case 0x71:
             case 0x72:
@@ -5348,74 +6928,137 @@ class binary_reader
             case 0x76:
             case 0x77:
             {
-                return get_string(current & 0x1f);
+                return get_string(current & 0x1F, result);
             }
 
             case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
             {
-                return get_string(get_number<uint8_t>());
+                uint8_t len;
+                return get_number(len) and get_string(len, result);
             }
 
             case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
             {
-                return get_string(get_number<uint16_t>());
+                uint16_t len;
+                return get_number(len) and get_string(len, result);
             }
 
-            case 0x7a: // UTF-8 string (four-byte uint32_t for n follow)
+            case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
             {
-                return get_string(get_number<uint32_t>());
+                uint32_t len;
+                return get_number(len) and get_string(len, result);
             }
 
-            case 0x7b: // UTF-8 string (eight-byte uint64_t for n follow)
+            case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
             {
-                return get_string(get_number<uint64_t>());
+                uint64_t len;
+                return get_number(len) and get_string(len, result);
             }
 
-            case 0x7f: // UTF-8 string (indefinite length)
+            case 0x7F: // UTF-8 string (indefinite length)
             {
-                std::string result;
-                while (get() != 0xff)
+                while (get() != 0xFF)
                 {
-                    check_eof();
-                    result.push_back(static_cast<char>(current));
+                    string_t chunk;
+                    if (not get_cbor_string(chunk))
+                    {
+                        return false;
+                    }
+                    result.append(chunk);
                 }
-                return result;
+                return true;
             }
 
             default:
             {
-                std::stringstream ss;
-                ss << std::setw(2) << std::setfill('0') << std::hex << current;
-                JSON_THROW(parse_error::create(113, chars_read, "expected a CBOR string; last byte: 0x" + ss.str()));
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "expected a CBOR string; last byte: 0x" + last_token));
             }
         }
     }
 
-    template<typename NumberType>
-    BasicJsonType get_cbor_array(const NumberType len)
+    /*!
+    @param[in] len  number of elements, or std::size_t(-1) for an array of
+                    indefinite size
+    @return whether the array was completely forwarded to the SAX consumer
+    */
+    bool get_cbor_array(const std::size_t len)
     {
-        BasicJsonType result = value_t::array;
-        std::generate_n(std::back_inserter(*result.m_value.array), len, [this]()
+        if (JSON_UNLIKELY(not sax->start_array(len)))
         {
-            return parse_cbor_internal();
-        });
-        return result;
+            return false;
+        }
+
+        if (len == std::size_t(-1))
+        {
+            while (get() != 0xFF)  // 0xFF ends an array of indefinite size
+            {
+                if (JSON_UNLIKELY(not parse_cbor_internal(false)))
+                {
+                    return false;
+                }
+            }
+            return sax->end_array();
+        }
+
+        for (std::size_t i = 0; i < len; ++i)
+        {
+            if (JSON_UNLIKELY(not parse_cbor_internal()))
+            {
+                return false;
+            }
+        }
+        return sax->end_array();
     }
 
-    template<typename NumberType>
-    BasicJsonType get_cbor_object(const NumberType len)
+    /*!
+    @param[in] len  the number of key/value pairs, or std::size_t(-1) for an
+                    object of indefinite size
+    @return whether object creation completed
+    */
+    bool get_cbor_object(const std::size_t len)
     {
-        BasicJsonType result = value_t::object;
-        std::generate_n(std::inserter(*result.m_value.object,
-                                      result.m_value.object->end()),
-                        len, [this]()
+        if (JSON_UNLIKELY(not sax->start_object(len)))
         {
-            get();
-            auto key = get_cbor_string();
-            auto val = parse_cbor_internal();
-            return std::make_pair(std::move(key), std::move(val));
-        });
-        return result;
+            return false;
+        }
+
+        string_t key;
+        if (len != std::size_t(-1))
+        {
+            for (std::size_t i = 0; i < len; ++i)
+            {
+                get();  // fetch the first byte of the key for get_cbor_string()
+                if (JSON_UNLIKELY(not get_cbor_string(key) or not sax->key(key)))
+                {
+                    return false;
+                }
+
+                if (JSON_UNLIKELY(not parse_cbor_internal()))
+                {
+                    return false;
+                }
+                key.clear();  // get_cbor_string() appends, so reset for the next key
+            }
+        }
+        else
+        {
+            while (get() != 0xFF)  // 0xFF ends an object of indefinite size
+            {
+                if (JSON_UNLIKELY(not get_cbor_string(key) or not sax->key(key)))
+                {
+                    return false;
+                }
+
+                if (JSON_UNLIKELY(not parse_cbor_internal()))
+                {
+                    return false;
+                }
+                key.clear();  // get_cbor_string() appends, so reset for the next key
+            }
+        }
+
+        return sax->end_object();
     }
 
     /*!
@@ -5424,126 +7067,568 @@ class binary_reader
     This function first reads starting bytes to determine the expected
     string length and then copies this number of bytes into a string.
 
-    @return string
+    @param[out] result  created string
 
-    @throw parse_error.110 if input ended
-    @throw parse_error.113 if an unexpected byte is read
+    @return whether string creation completed
     */
-    std::string get_msgpack_string()
+    bool get_msgpack_string(string_t& result)
     {
-        check_eof();
+        if (JSON_UNLIKELY(not unexpect_eof()))
+        {
+            return false;
+        }
 
         switch (current)
         {
             // fixstr
-            case 0xa0:
-            case 0xa1:
-            case 0xa2:
-            case 0xa3:
-            case 0xa4:
-            case 0xa5:
-            case 0xa6:
-            case 0xa7:
-            case 0xa8:
-            case 0xa9:
-            case 0xaa:
-            case 0xab:
-            case 0xac:
-            case 0xad:
-            case 0xae:
-            case 0xaf:
-            case 0xb0:
-            case 0xb1:
-            case 0xb2:
-            case 0xb3:
-            case 0xb4:
-            case 0xb5:
-            case 0xb6:
-            case 0xb7:
-            case 0xb8:
-            case 0xb9:
-            case 0xba:
-            case 0xbb:
-            case 0xbc:
-            case 0xbd:
-            case 0xbe:
-            case 0xbf:
+            case 0xA0:
+            case 0xA1:
+            case 0xA2:
+            case 0xA3:
+            case 0xA4:
+            case 0xA5:
+            case 0xA6:
+            case 0xA7:
+            case 0xA8:
+            case 0xA9:
+            case 0xAA:
+            case 0xAB:
+            case 0xAC:
+            case 0xAD:
+            case 0xAE:
+            case 0xAF:
+            case 0xB0:
+            case 0xB1:
+            case 0xB2:
+            case 0xB3:
+            case 0xB4:
+            case 0xB5:
+            case 0xB6:
+            case 0xB7:
+            case 0xB8:
+            case 0xB9:
+            case 0xBA:
+            case 0xBB:
+            case 0xBC:
+            case 0xBD:
+            case 0xBE:
+            case 0xBF:
             {
-                return get_string(current & 0x1f);
+                return get_string(current & 0x1F, result);
             }
 
-            case 0xd9: // str 8
+            case 0xD9: // str 8
             {
-                return get_string(get_number<uint8_t>());
+                uint8_t len;
+                return get_number(len) and get_string(len, result);
             }
 
-            case 0xda: // str 16
+            case 0xDA: // str 16
             {
-                return get_string(get_number<uint16_t>());
+                uint16_t len;
+                return get_number(len) and get_string(len, result);
             }
 
-            case 0xdb: // str 32
+            case 0xDB: // str 32
             {
-                return get_string(get_number<uint32_t>());
+                uint32_t len;
+                return get_number(len) and get_string(len, result);
             }
 
             default:
             {
-                std::stringstream ss;
-                ss << std::setw(2) << std::setfill('0') << std::hex << current;
-                JSON_THROW(parse_error::create(113, chars_read,
-                                               "expected a MessagePack string; last byte: 0x" + ss.str()));
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "expected a MessagePack string; last byte: 0x" + last_token));
             }
         }
     }
 
-    template<typename NumberType>
-    BasicJsonType get_msgpack_array(const NumberType len)
+    /*!
+    @param[in] len  number of elements the array contains
+    @return whether the array was completely forwarded to the SAX consumer
+    */
+    bool get_msgpack_array(const std::size_t len)
     {
-        BasicJsonType result = value_t::array;
-        std::generate_n(std::back_inserter(*result.m_value.array), len, [this]()
+        if (JSON_UNLIKELY(not sax->start_array(len)))
         {
-            return parse_msgpack_internal();
-        });
-        return result;
-    }
+            return false;
+        }
 
-    template<typename NumberType>
-    BasicJsonType get_msgpack_object(const NumberType len)
-    {
-        BasicJsonType result = value_t::object;
-        std::generate_n(std::inserter(*result.m_value.object,
-                                      result.m_value.object->end()),
-                        len, [this]()
+        for (std::size_t remaining = len; remaining > 0; --remaining)
         {
-            get();
-            auto key = get_msgpack_string();
-            auto val = parse_msgpack_internal();
-            return std::make_pair(std::move(key), std::move(val));
-        });
-        return result;
+            if (JSON_UNLIKELY(not parse_msgpack_internal()))
+            {
+                return false;
+            }
+        }
+
+        return sax->end_array();
     }
 
     /*!
-    @brief check if input ended
-    @throw parse_error.110 if input ended
+    @param[in] len  the number of key/value pairs in the object
+    @return whether object creation completed
     */
-    void check_eof(const bool expect_eof = false) const
+    bool get_msgpack_object(const std::size_t len)
     {
-        if (expect_eof)
+        if (JSON_UNLIKELY(not sax->start_object(len)))
         {
-            if (JSON_UNLIKELY(current != std::char_traits<char>::eof()))
+            return false;
+        }
+
+        string_t key;
+        for (std::size_t i = 0; i < len; ++i)
+        {
+            get();  // fetch the first byte of the key for get_msgpack_string()
+            if (JSON_UNLIKELY(not get_msgpack_string(key) or not sax->key(key)))
             {
-                JSON_THROW(parse_error::create(110, chars_read, "expected end of input"));
+                return false;
+            }
+
+            if (JSON_UNLIKELY(not parse_msgpack_internal()))
+            {
+                return false;
+            }
+            key.clear();  // get_msgpack_string() appends, so reset for the next key
+        }
+
+        return sax->end_object();
+    }
+
+    /*!
+    @brief reads a UBJSON string
+
+    Used after an explicit 'S' byte, and for object keys, which omit the 'S'.
+    The length is read as an integer whose type marker precedes it.
+
+    @param[out] result   created string
+    @param[in] get_char  true (default): read the length's type marker from
+                         the input; false: use the last read character
+
+    @return whether string creation completed
+    */
+    bool get_ubjson_string(string_t& result, const bool get_char = true)
+    {
+        if (get_char)
+        {
+            get();  // TODO: may we ignore N here?
+        }
+
+        if (JSON_UNLIKELY(not unexpect_eof()))
+        {
+            return false;
+        }
+
+        switch (current)
+        {
+            case 'U': // length stored as uint8_t
+            {
+                uint8_t length;
+                return get_number(length) and get_string(length, result);
+            }
+
+            case 'i': // length stored as int8_t
+            {
+                int8_t length;
+                return get_number(length) and get_string(length, result);
+            }
+
+            case 'I': // length stored as int16_t
+            {
+                int16_t length;
+                return get_number(length) and get_string(length, result);
+            }
+
+            case 'l': // length stored as int32_t
+            {
+                int32_t length;
+                return get_number(length) and get_string(length, result);
+            }
+
+            case 'L': // length stored as int64_t
+            {
+                int64_t length;
+                return get_number(length) and get_string(length, result);
+            }
+
+            default:
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "expected a UBJSON string; last byte: 0x" + last_token));
+            }
+        }
+    }
+
+    /*!
+    @param[out] result  size read after the '#' marker; written only on success
+    @return whether size determination completed
+    */
+    bool get_ubjson_size_value(std::size_t& result)
+    {
+        switch (get_ignore_noop())  // next byte (presumably skipping 'N' no-op markers) names the count's integer type
+        {
+            case 'U':  // count stored as uint8_t
+            {
+                uint8_t number;
+                if (JSON_UNLIKELY(not get_number(number)))
+                {
+                    return false;
+                }
+                result = static_cast<std::size_t>(number);
+                return true;
+            }
+
+            case 'i':  // count stored as int8_t
+            {
+                int8_t number;
+                if (JSON_UNLIKELY(not get_number(number)))
+                {
+                    return false;
+                }
+                result = static_cast<std::size_t>(number);  // NOTE(review): a negative count is not rejected; the cast wraps it to a huge size (also for I/l/L below)
+                return true;
+            }
+
+            case 'I':  // count stored as int16_t
+            {
+                int16_t number;
+                if (JSON_UNLIKELY(not get_number(number)))
+                {
+                    return false;
+                }
+                result = static_cast<std::size_t>(number);
+                return true;
+            }
+
+            case 'l':  // count stored as int32_t
+            {
+                int32_t number;
+                if (JSON_UNLIKELY(not get_number(number)))
+                {
+                    return false;
+                }
+                result = static_cast<std::size_t>(number);
+                return true;
+            }
+
+            case 'L':  // count stored as int64_t
+            {
+                int64_t number;
+                if (JSON_UNLIKELY(not get_number(number)))
+                {
+                    return false;
+                }
+                result = static_cast<std::size_t>(number);
+                return true;
+            }
+
+            default:  // not an integer type marker: report parse error 113 and leave result untouched
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "byte after '#' must denote a number type; last byte: 0x" + last_token));
+            }
+        }
+    }
+
+    /*!
+    @brief determine the type and size for a container
+
+    In the optimized UBJSON format, a type and a size can be provided to allow
+    for a more compact representation.
+
+    @param[out] result  pair of the size and the type; stays (npos, 0) if neither '$' nor '#' is read
+
+    @return whether pair creation completed
+    */
+    bool get_ubjson_size_type(std::pair<std::size_t, int>& result)
+    {
+        result.first = string_t::npos; // size; npos stands for "no size given"
+        result.second = 0; // type; 0 stands for "no type given"
+
+        get_ignore_noop();
+
+        if (current == '$')  // '$' introduces a type marker, which must be followed by '#' and a size
+        {
+            result.second = get();  // must not ignore 'N', because 'N' may be the type
+            if (JSON_UNLIKELY(not unexpect_eof()))
+            {
+                return false;
+            }
+
+            get_ignore_noop();
+            if (JSON_UNLIKELY(current != '#'))
+            {
+                if (JSON_UNLIKELY(not unexpect_eof()))  // report EOF as error 110 rather than as a missing '#'
+                {
+                    return false;
+                }
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, "expected '#' after UBJSON type information; last byte: 0x" + last_token));
+            }
+
+            return get_ubjson_size_value(result.first);
+        }
+        else if (current == '#')  // size without a type
+        {
+            return get_ubjson_size_value(result.first);
+        }
+        return true;  // neither marker: result keeps its defaults and current holds the byte just read
+    }
+
+    /*!
+    @param prefix  the previously read or set type prefix
+    @return whether value creation completed
+    */
+    bool get_ubjson_value(const int prefix)
+    {
+        switch (prefix)  // dispatch on the type marker; each case forwards the decoded value to the SAX consumer
+        {
+            case std::char_traits<char>::eof():  // EOF
+                return unexpect_eof();  // reports parse error 110 through the SAX consumer
+
+            case 'T':  // true
+                return sax->boolean(true);
+            case 'F':  // false
+                return sax->boolean(false);
+
+            case 'Z':  // null
+                return sax->null();
+
+            case 'U':  // read as uint8_t, reported as unsigned
+            {
+                uint8_t number;
+                return get_number(number) and sax->number_unsigned(number);
+            }
+
+            case 'i':  // read as int8_t
+            {
+                int8_t number;
+                return get_number(number) and sax->number_integer(number);
+            }
+
+            case 'I':  // read as int16_t
+            {
+                int16_t number;
+                return get_number(number) and sax->number_integer(number);
+            }
+
+            case 'l':  // read as int32_t
+            {
+                int32_t number;
+                return get_number(number) and sax->number_integer(number);
+            }
+
+            case 'L':  // read as int64_t
+            {
+                int64_t number;
+                return get_number(number) and sax->number_integer(number);
+            }
+
+            case 'd':  // read as float, widened to number_float_t
+            {
+                float number;
+                return get_number(number) and sax->number_float(static_cast<number_float_t>(number), "");  // second argument is an empty string
+            }
+
+            case 'D':  // read as double
+            {
+                double number;
+                return get_number(number) and sax->number_float(static_cast<number_float_t>(number), "");  // second argument is an empty string
+            }
+
+            case 'C':  // char
+            {
+                get();  // the character itself is the next byte
+                if (JSON_UNLIKELY(not unexpect_eof()))
+                {
+                    return false;
+                }
+                if (JSON_UNLIKELY(current > 127))  // only bytes 0x00..0x7F are accepted
+                {
+                    auto last_token = get_token_string();
+                    return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, "byte after 'C' must be in range 0x00..0x7F; last byte: 0x" + last_token));
+                }
+                string_t s(1, static_cast<char>(current));  // reported as a one-character string
+                return sax->string(s);
+            }
+
+            case 'S':  // string
+            {
+                string_t s;
+                return get_ubjson_string(s) and sax->string(s);
+            }
+
+            case '[':  // array
+                return get_ubjson_array();
+
+            case '{':  // object
+                return get_ubjson_object();
+
+            default: // anything else
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, "error reading UBJSON; last byte: 0x" + last_token));
+            }
+        }
+    }
+
+    /*!
+    @return whether array creation completed
+    */
+    bool get_ubjson_array()
+    {
+        std::pair<std::size_t, int> size_and_type;  // first: element count (npos if absent), second: element type (0 if absent)
+        if (JSON_UNLIKELY(not get_ubjson_size_type(size_and_type)))
+        {
+            return false;
+        }
+
+        if (size_and_type.first != string_t::npos)  // sized array: the element count is known up front
+        {
+            if (JSON_UNLIKELY(not sax->start_array(size_and_type.first)))
+            {
+                return false;
+            }
+
+            if (size_and_type.second != 0)  // typed array: every element is decoded with the shared type marker
+            {
+                if (size_and_type.second != 'N')  // for type 'N' nothing is read and no values are emitted
+                {
+                    for (std::size_t i = 0; i < size_and_type.first; ++i)
+                    {
+                        if (JSON_UNLIKELY(not get_ubjson_value(size_and_type.second)))
+                        {
+                            return false;
+                        }
+                    }
+                }
+            }
+            else  // untyped: each element is parsed on its own (presumably starting with its own type marker)
+            {
+                for (std::size_t i = 0; i < size_and_type.first; ++i)
+                {
+                    if (JSON_UNLIKELY(not parse_ubjson_internal()))
+                    {
+                        return false;
+                    }
+                }
             }
         }
         else
         {
-            if (JSON_UNLIKELY(current == std::char_traits<char>::eof()))
+            if (JSON_UNLIKELY(not sax->start_array(std::size_t(-1))))  // no size given: std::size_t(-1) is passed as the count
             {
-                JSON_THROW(parse_error::create(110, chars_read, "unexpected end of input"));
+                return false;
+            }
+
+            while (current != ']')  // current was left by get_ubjson_size_type; loop until the closing ']'
+            {
+                if (JSON_UNLIKELY(not parse_ubjson_internal(false)))  // false: presumably "reuse current instead of reading a new byte"
+                {
+                    return false;
+                }
+                get_ignore_noop();  // fetch the next marker (or ']') for the loop test
             }
         }
+
+        return sax->end_array();
+    }
+
+    /*!
+    @return whether object creation completed
+    */
+    bool get_ubjson_object()
+    {
+        std::pair<std::size_t, int> size_and_type;  // first: entry count (npos if absent), second: value type (0 if absent)
+        if (JSON_UNLIKELY(not get_ubjson_size_type(size_and_type)))
+        {
+            return false;
+        }
+
+        string_t key;  // one buffer reused for every key
+        if (size_and_type.first != string_t::npos)  // sized object: the entry count is known up front
+        {
+            if (JSON_UNLIKELY(not sax->start_object(size_and_type.first)))
+            {
+                return false;
+            }
+
+            if (size_and_type.second != 0)  // typed object: every value is decoded with the shared type marker
+            {
+                for (std::size_t i = 0; i < size_and_type.first; ++i)  // NOTE(review): unlike get_ubjson_array, type 'N' is not special-cased here
+                {
+                    if (JSON_UNLIKELY(not get_ubjson_string(key) or not sax->key(key)))
+                    {
+                        return false;
+                    }
+                    if (JSON_UNLIKELY(not get_ubjson_value(size_and_type.second)))
+                    {
+                        return false;
+                    }
+                    key.clear();
+                }
+            }
+            else  // untyped: each value is parsed on its own (presumably starting with its own type marker)
+            {
+                for (std::size_t i = 0; i < size_and_type.first; ++i)
+                {
+                    if (JSON_UNLIKELY(not get_ubjson_string(key) or not sax->key(key)))
+                    {
+                        return false;
+                    }
+                    if (JSON_UNLIKELY(not parse_ubjson_internal()))
+                    {
+                        return false;
+                    }
+                    key.clear();
+                }
+            }
+        }
+        else
+        {
+            if (JSON_UNLIKELY(not sax->start_object(std::size_t(-1))))  // no size given: std::size_t(-1) is passed as the count
+            {
+                return false;
+            }
+
+            while (current != '}')  // current was left by get_ubjson_size_type; loop until the closing '}'
+            {
+                if (JSON_UNLIKELY(not get_ubjson_string(key, false) or not sax->key(key)))  // false: the key's length marker is already in current
+                {
+                    return false;
+                }
+                if (JSON_UNLIKELY(not parse_ubjson_internal()))
+                {
+                    return false;
+                }
+                get_ignore_noop();  // fetch the next key's length marker (or '}') for the loop test
+                key.clear();
+            }
+        }
+
+        return sax->end_object();
+    }
+
+    /*!
+    @return true unless the last read character is EOF; on EOF, the result of
+            reporting parse error 110 through the SAX interface
+    */
+    bool unexpect_eof() const
+    {
+        const bool hit_eof = (current == std::char_traits<char>::eof());
+        return JSON_UNLIKELY(hit_eof)
+               ? sax->parse_error(chars_read, "<end of file>", parse_error::create(110, chars_read, "unexpected end of input"))
+               : true;
+    }
+
+    /*!
+    @return the last read byte rendered as two uppercase hexadecimal digits
+    */
+    std::string get_token_string() const
+    {
+        char hex[3];  // two hex digits plus the terminating NUL
+        snprintf(hex, sizeof(hex), "%.2hhX", static_cast<unsigned char>(current));
+        return std::string(hex);
     }
 
   private:
@@ -5558,7 +7643,34 @@ class binary_reader
 
     /// whether we can assume little endianess
     const bool is_little_endian = little_endianess();
+
+    /// the SAX parser
+    json_sax_t* sax = nullptr;
 };
+}
+}
+
+// #include <nlohmann/detail/output/binary_writer.hpp>
+
+
+#include <algorithm> // reverse
+#include <array> // array
+#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
+#include <cstring> // memcpy
+#include <limits> // numeric_limits
+
+// #include <nlohmann/detail/input/binary_reader.hpp>
+
+// #include <nlohmann/detail/output/output_adapters.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////////////
+// binary writer //
+///////////////////
 
 /*!
 @brief serialization to CBOR and MessagePack values
@@ -5586,15 +7698,15 @@ class binary_writer
         {
             case value_t::null:
             {
-                oa->write_character(static_cast<CharType>(0xf6));
+                oa->write_character(static_cast<CharType>(0xF6));
                 break;
             }
 
             case value_t::boolean:
             {
                 oa->write_character(j.m_value.boolean
-                                    ? static_cast<CharType>(0xf5)
-                                    : static_cast<CharType>(0xf4));
+                                    ? static_cast<CharType>(0xF5)
+                                    : static_cast<CharType>(0xF4));
                 break;
             }
 
@@ -5621,12 +7733,12 @@ class binary_writer
                     }
                     else if (j.m_value.number_integer <= (std::numeric_limits<uint32_t>::max)())
                     {
-                        oa->write_character(static_cast<CharType>(0x1a));
+                        oa->write_character(static_cast<CharType>(0x1A));
                         write_number(static_cast<uint32_t>(j.m_value.number_integer));
                     }
                     else
                     {
-                        oa->write_character(static_cast<CharType>(0x1b));
+                        oa->write_character(static_cast<CharType>(0x1B));
                         write_number(static_cast<uint64_t>(j.m_value.number_integer));
                     }
                 }
@@ -5651,12 +7763,12 @@ class binary_writer
                     }
                     else if (positive_number <= (std::numeric_limits<uint32_t>::max)())
                     {
-                        oa->write_character(static_cast<CharType>(0x3a));
+                        oa->write_character(static_cast<CharType>(0x3A));
                         write_number(static_cast<uint32_t>(positive_number));
                     }
                     else
                     {
-                        oa->write_character(static_cast<CharType>(0x3b));
+                        oa->write_character(static_cast<CharType>(0x3B));
                         write_number(static_cast<uint64_t>(positive_number));
                     }
                 }
@@ -5681,20 +7793,20 @@ class binary_writer
                 }
                 else if (j.m_value.number_unsigned <= (std::numeric_limits<uint32_t>::max)())
                 {
-                    oa->write_character(static_cast<CharType>(0x1a));
+                    oa->write_character(static_cast<CharType>(0x1A));
                     write_number(static_cast<uint32_t>(j.m_value.number_unsigned));
                 }
                 else
                 {
-                    oa->write_character(static_cast<CharType>(0x1b));
+                    oa->write_character(static_cast<CharType>(0x1B));
                     write_number(static_cast<uint64_t>(j.m_value.number_unsigned));
                 }
                 break;
             }
 
-            case value_t::number_float: // Double-Precision Float
+            case value_t::number_float:
             {
-                oa->write_character(static_cast<CharType>(0xfb));
+                oa->write_character(get_cbor_float_prefix(j.m_value.number_float));
                 write_number(j.m_value.number_float);
                 break;
             }
@@ -5707,25 +7819,25 @@ class binary_writer
                 {
                     write_number(static_cast<uint8_t>(0x60 + N));
                 }
-                else if (N <= 0xff)
+                else if (N <= (std::numeric_limits<uint8_t>::max)())
                 {
                     oa->write_character(static_cast<CharType>(0x78));
                     write_number(static_cast<uint8_t>(N));
                 }
-                else if (N <= 0xffff)
+                else if (N <= (std::numeric_limits<uint16_t>::max)())
                 {
                     oa->write_character(static_cast<CharType>(0x79));
                     write_number(static_cast<uint16_t>(N));
                 }
-                else if (N <= 0xffffffff)
+                else if (N <= (std::numeric_limits<uint32_t>::max)())
                 {
-                    oa->write_character(static_cast<CharType>(0x7a));
+                    oa->write_character(static_cast<CharType>(0x7A));
                     write_number(static_cast<uint32_t>(N));
                 }
                 // LCOV_EXCL_START
-                else if (N <= 0xffffffffffffffff)
+                else if (N <= (std::numeric_limits<uint64_t>::max)())
                 {
-                    oa->write_character(static_cast<CharType>(0x7b));
+                    oa->write_character(static_cast<CharType>(0x7B));
                     write_number(static_cast<uint64_t>(N));
                 }
                 // LCOV_EXCL_STOP
@@ -5745,25 +7857,25 @@ class binary_writer
                 {
                     write_number(static_cast<uint8_t>(0x80 + N));
                 }
-                else if (N <= 0xff)
+                else if (N <= (std::numeric_limits<uint8_t>::max)())
                 {
                     oa->write_character(static_cast<CharType>(0x98));
                     write_number(static_cast<uint8_t>(N));
                 }
-                else if (N <= 0xffff)
+                else if (N <= (std::numeric_limits<uint16_t>::max)())
                 {
                     oa->write_character(static_cast<CharType>(0x99));
                     write_number(static_cast<uint16_t>(N));
                 }
-                else if (N <= 0xffffffff)
+                else if (N <= (std::numeric_limits<uint32_t>::max)())
                 {
-                    oa->write_character(static_cast<CharType>(0x9a));
+                    oa->write_character(static_cast<CharType>(0x9A));
                     write_number(static_cast<uint32_t>(N));
                 }
                 // LCOV_EXCL_START
-                else if (N <= 0xffffffffffffffff)
+                else if (N <= (std::numeric_limits<uint64_t>::max)())
                 {
-                    oa->write_character(static_cast<CharType>(0x9b));
+                    oa->write_character(static_cast<CharType>(0x9B));
                     write_number(static_cast<uint64_t>(N));
                 }
                 // LCOV_EXCL_STOP
@@ -5782,27 +7894,27 @@ class binary_writer
                 const auto N = j.m_value.object->size();
                 if (N <= 0x17)
                 {
-                    write_number(static_cast<uint8_t>(0xa0 + N));
+                    write_number(static_cast<uint8_t>(0xA0 + N));
                 }
-                else if (N <= 0xff)
+                else if (N <= (std::numeric_limits<uint8_t>::max)())
                 {
-                    oa->write_character(static_cast<CharType>(0xb8));
+                    oa->write_character(static_cast<CharType>(0xB8));
                     write_number(static_cast<uint8_t>(N));
                 }
-                else if (N <= 0xffff)
+                else if (N <= (std::numeric_limits<uint16_t>::max)())
                 {
-                    oa->write_character(static_cast<CharType>(0xb9));
+                    oa->write_character(static_cast<CharType>(0xB9));
                     write_number(static_cast<uint16_t>(N));
                 }
-                else if (N <= 0xffffffff)
+                else if (N <= (std::numeric_limits<uint32_t>::max)())
                 {
-                    oa->write_character(static_cast<CharType>(0xba));
+                    oa->write_character(static_cast<CharType>(0xBA));
                     write_number(static_cast<uint32_t>(N));
                 }
                 // LCOV_EXCL_START
-                else if (N <= 0xffffffffffffffff)
+                else if (N <= (std::numeric_limits<uint64_t>::max)())
                 {
-                    oa->write_character(static_cast<CharType>(0xbb));
+                    oa->write_character(static_cast<CharType>(0xBB));
                     write_number(static_cast<uint64_t>(N));
                 }
                 // LCOV_EXCL_STOP
@@ -5830,15 +7942,15 @@ class binary_writer
         {
             case value_t::null: // nil
             {
-                oa->write_character(static_cast<CharType>(0xc0));
+                oa->write_character(static_cast<CharType>(0xC0));
                 break;
             }
 
             case value_t::boolean: // true and false
             {
                 oa->write_character(j.m_value.boolean
-                                    ? static_cast<CharType>(0xc3)
-                                    : static_cast<CharType>(0xc2));
+                                    ? static_cast<CharType>(0xC3)
+                                    : static_cast<CharType>(0xC2));
                 break;
             }
 
@@ -5857,25 +7969,25 @@ class binary_writer
                     else if (j.m_value.number_unsigned <= (std::numeric_limits<uint8_t>::max)())
                     {
                         // uint 8
-                        oa->write_character(static_cast<CharType>(0xcc));
+                        oa->write_character(static_cast<CharType>(0xCC));
                         write_number(static_cast<uint8_t>(j.m_value.number_integer));
                     }
                     else if (j.m_value.number_unsigned <= (std::numeric_limits<uint16_t>::max)())
                     {
                         // uint 16
-                        oa->write_character(static_cast<CharType>(0xcd));
+                        oa->write_character(static_cast<CharType>(0xCD));
                         write_number(static_cast<uint16_t>(j.m_value.number_integer));
                     }
                     else if (j.m_value.number_unsigned <= (std::numeric_limits<uint32_t>::max)())
                     {
                         // uint 32
-                        oa->write_character(static_cast<CharType>(0xce));
+                        oa->write_character(static_cast<CharType>(0xCE));
                         write_number(static_cast<uint32_t>(j.m_value.number_integer));
                     }
                     else if (j.m_value.number_unsigned <= (std::numeric_limits<uint64_t>::max)())
                     {
                         // uint 64
-                        oa->write_character(static_cast<CharType>(0xcf));
+                        oa->write_character(static_cast<CharType>(0xCF));
                         write_number(static_cast<uint64_t>(j.m_value.number_integer));
                     }
                 }
@@ -5890,28 +8002,28 @@ class binary_writer
                              j.m_value.number_integer <= (std::numeric_limits<int8_t>::max)())
                     {
                         // int 8
-                        oa->write_character(static_cast<CharType>(0xd0));
+                        oa->write_character(static_cast<CharType>(0xD0));
                         write_number(static_cast<int8_t>(j.m_value.number_integer));
                     }
                     else if (j.m_value.number_integer >= (std::numeric_limits<int16_t>::min)() and
                              j.m_value.number_integer <= (std::numeric_limits<int16_t>::max)())
                     {
                         // int 16
-                        oa->write_character(static_cast<CharType>(0xd1));
+                        oa->write_character(static_cast<CharType>(0xD1));
                         write_number(static_cast<int16_t>(j.m_value.number_integer));
                     }
                     else if (j.m_value.number_integer >= (std::numeric_limits<int32_t>::min)() and
                              j.m_value.number_integer <= (std::numeric_limits<int32_t>::max)())
                     {
                         // int 32
-                        oa->write_character(static_cast<CharType>(0xd2));
+                        oa->write_character(static_cast<CharType>(0xD2));
                         write_number(static_cast<int32_t>(j.m_value.number_integer));
                     }
                     else if (j.m_value.number_integer >= (std::numeric_limits<int64_t>::min)() and
                              j.m_value.number_integer <= (std::numeric_limits<int64_t>::max)())
                     {
                         // int 64
-                        oa->write_character(static_cast<CharType>(0xd3));
+                        oa->write_character(static_cast<CharType>(0xD3));
                         write_number(static_cast<int64_t>(j.m_value.number_integer));
                     }
                 }
@@ -5928,33 +8040,33 @@ class binary_writer
                 else if (j.m_value.number_unsigned <= (std::numeric_limits<uint8_t>::max)())
                 {
                     // uint 8
-                    oa->write_character(static_cast<CharType>(0xcc));
+                    oa->write_character(static_cast<CharType>(0xCC));
                     write_number(static_cast<uint8_t>(j.m_value.number_integer));
                 }
                 else if (j.m_value.number_unsigned <= (std::numeric_limits<uint16_t>::max)())
                 {
                     // uint 16
-                    oa->write_character(static_cast<CharType>(0xcd));
+                    oa->write_character(static_cast<CharType>(0xCD));
                     write_number(static_cast<uint16_t>(j.m_value.number_integer));
                 }
                 else if (j.m_value.number_unsigned <= (std::numeric_limits<uint32_t>::max)())
                 {
                     // uint 32
-                    oa->write_character(static_cast<CharType>(0xce));
+                    oa->write_character(static_cast<CharType>(0xCE));
                     write_number(static_cast<uint32_t>(j.m_value.number_integer));
                 }
                 else if (j.m_value.number_unsigned <= (std::numeric_limits<uint64_t>::max)())
                 {
                     // uint 64
-                    oa->write_character(static_cast<CharType>(0xcf));
+                    oa->write_character(static_cast<CharType>(0xCF));
                     write_number(static_cast<uint64_t>(j.m_value.number_integer));
                 }
                 break;
             }
 
-            case value_t::number_float: // float 64
+            case value_t::number_float:
             {
-                oa->write_character(static_cast<CharType>(0xcb));
+                oa->write_character(get_msgpack_float_prefix(j.m_value.number_float));
                 write_number(j.m_value.number_float);
                 break;
             }
@@ -5966,24 +8078,24 @@ class binary_writer
                 if (N <= 31)
                 {
                     // fixstr
-                    write_number(static_cast<uint8_t>(0xa0 | N));
+                    write_number(static_cast<uint8_t>(0xA0 | N));
                 }
-                else if (N <= 255)
+                else if (N <= (std::numeric_limits<uint8_t>::max)())
                 {
                     // str 8
-                    oa->write_character(static_cast<CharType>(0xd9));
+                    oa->write_character(static_cast<CharType>(0xD9));
                     write_number(static_cast<uint8_t>(N));
                 }
-                else if (N <= 65535)
+                else if (N <= (std::numeric_limits<uint16_t>::max)())
                 {
                     // str 16
-                    oa->write_character(static_cast<CharType>(0xda));
+                    oa->write_character(static_cast<CharType>(0xDA));
                     write_number(static_cast<uint16_t>(N));
                 }
-                else if (N <= 4294967295)
+                else if (N <= (std::numeric_limits<uint32_t>::max)())
                 {
                     // str 32
-                    oa->write_character(static_cast<CharType>(0xdb));
+                    oa->write_character(static_cast<CharType>(0xDB));
                     write_number(static_cast<uint32_t>(N));
                 }
 
@@ -6003,16 +8115,16 @@ class binary_writer
                     // fixarray
                     write_number(static_cast<uint8_t>(0x90 | N));
                 }
-                else if (N <= 0xffff)
+                else if (N <= (std::numeric_limits<uint16_t>::max)())
                 {
                     // array 16
-                    oa->write_character(static_cast<CharType>(0xdc));
+                    oa->write_character(static_cast<CharType>(0xDC));
                     write_number(static_cast<uint16_t>(N));
                 }
-                else if (N <= 0xffffffff)
+                else if (N <= (std::numeric_limits<uint32_t>::max)())
                 {
                     // array 32
-                    oa->write_character(static_cast<CharType>(0xdd));
+                    oa->write_character(static_cast<CharType>(0xDD));
                     write_number(static_cast<uint32_t>(N));
                 }
 
@@ -6031,18 +8143,18 @@ class binary_writer
                 if (N <= 15)
                 {
                     // fixmap
-                    write_number(static_cast<uint8_t>(0x80 | (N & 0xf)));
+                    write_number(static_cast<uint8_t>(0x80 | (N & 0xF)));
                 }
-                else if (N <= 65535)
+                else if (N <= (std::numeric_limits<uint16_t>::max)())
                 {
                     // map 16
-                    oa->write_character(static_cast<CharType>(0xde));
+                    oa->write_character(static_cast<CharType>(0xDE));
                     write_number(static_cast<uint16_t>(N));
                 }
-                else if (N <= 4294967295)
+                else if (N <= (std::numeric_limits<uint32_t>::max)())
                 {
                     // map 32
-                    oa->write_character(static_cast<CharType>(0xdf));
+                    oa->write_character(static_cast<CharType>(0xDF));
                     write_number(static_cast<uint32_t>(N));
                 }
 
@@ -6060,6 +8172,165 @@ class binary_writer
         }
     }
 
+    /*! @brief serialize a JSON value to the output adapter in UBJSON format
+    @param[in] j  JSON value to serialize
+    @param[in] use_count   write '#' and the element count for containers (and no closing ']' / '}')
+    @param[in] use_type    write '$' and one shared type prefix for containers whose elements all have the same prefix; asserts use_count
+    @param[in] add_prefix  whether to write the type prefix of j itself (false for elements of a '$'-typed container)
+    */
+    void write_ubjson(const BasicJsonType& j, const bool use_count,
+                      const bool use_type, const bool add_prefix = true)
+    {
+        switch (j.type())
+        {
+            case value_t::null:
+            {
+                if (add_prefix)  // null has no payload: without a prefix nothing is written
+                {
+                    oa->write_character(static_cast<CharType>('Z'));
+                }
+                break;
+            }
+
+            case value_t::boolean:
+            {
+                if (add_prefix)  // the prefix itself carries the value ('T' or 'F')
+                    oa->write_character(j.m_value.boolean
+                                        ? static_cast<CharType>('T')
+                                        : static_cast<CharType>('F'));
+                break;
+            }
+
+            case value_t::number_integer:
+            {
+                write_number_with_ubjson_prefix(j.m_value.number_integer, add_prefix);
+                break;
+            }
+
+            case value_t::number_unsigned:
+            {
+                write_number_with_ubjson_prefix(j.m_value.number_unsigned, add_prefix);
+                break;
+            }
+
+            case value_t::number_float:
+            {
+                write_number_with_ubjson_prefix(j.m_value.number_float, add_prefix);
+                break;
+            }
+
+            case value_t::string:
+            {
+                if (add_prefix)
+                {
+                    oa->write_character(static_cast<CharType>('S'));
+                }
+                write_number_with_ubjson_prefix(j.m_value.string->size(), true);  // the length always carries its own integer prefix
+                oa->write_characters(
+                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
+                    j.m_value.string->size());
+                break;
+            }
+
+            case value_t::array:
+            {
+                if (add_prefix)
+                {
+                    oa->write_character(static_cast<CharType>('['));
+                }
+
+                bool prefix_required = true;  // cleared below when a single '$' prefix covers all elements
+                if (use_type and not j.m_value.array->empty())
+                {
+                    assert(use_count);
+                    const CharType first_prefix = ubjson_prefix(j.front());
+                    const bool same_prefix = std::all_of(j.begin() + 1, j.end(),  // the first element is the reference; check the rest
+                                                         [this, first_prefix](const BasicJsonType & v)
+                    {
+                        return ubjson_prefix(v) == first_prefix;
+                    });
+
+                    if (same_prefix)
+                    {
+                        prefix_required = false;
+                        oa->write_character(static_cast<CharType>('$'));
+                        oa->write_character(first_prefix);
+                    }
+                }
+
+                if (use_count)
+                {
+                    oa->write_character(static_cast<CharType>('#'));
+                    write_number_with_ubjson_prefix(j.m_value.array->size(), true);
+                }
+
+                for (const auto& el : *j.m_value.array)
+                {
+                    write_ubjson(el, use_count, use_type, prefix_required);
+                }
+
+                if (not use_count)  // no count was written, so the array needs an explicit terminator
+                {
+                    oa->write_character(static_cast<CharType>(']'));
+                }
+
+                break;
+            }
+
+            case value_t::object:
+            {
+                if (add_prefix)
+                {
+                    oa->write_character(static_cast<CharType>('{'));
+                }
+
+                bool prefix_required = true;  // cleared below when a single '$' prefix covers all values
+                if (use_type and not j.m_value.object->empty())
+                {
+                    assert(use_count);
+                    const CharType first_prefix = ubjson_prefix(j.front());
+                    const bool same_prefix = std::all_of(j.begin(), j.end(),
+                                                         [this, first_prefix](const BasicJsonType & v)
+                    {
+                        return ubjson_prefix(v) == first_prefix;
+                    });
+
+                    if (same_prefix)
+                    {
+                        prefix_required = false;
+                        oa->write_character(static_cast<CharType>('$'));
+                        oa->write_character(first_prefix);
+                    }
+                }
+
+                if (use_count)
+                {
+                    oa->write_character(static_cast<CharType>('#'));
+                    write_number_with_ubjson_prefix(j.m_value.object->size(), true);
+                }
+
+                for (const auto& el : *j.m_value.object)
+                {
+                    write_number_with_ubjson_prefix(el.first.size(), true);  // key: prefixed length + bytes, no 'S' marker
+                    oa->write_characters(
+                        reinterpret_cast<const CharType*>(el.first.c_str()),
+                        el.first.size());
+                    write_ubjson(el.second, use_count, use_type, prefix_required);
+                }
+
+                if (not use_count)  // no count was written, so the object needs an explicit terminator
+                {
+                    oa->write_character(static_cast<CharType>('}'));
+                }
+
+                break;
+            }
+
+            default:  // any other value_t: nothing is written
+                break;
+        }
+    }
+
   private:
     /*
     @brief write a number to output input
@@ -6068,10 +8339,11 @@ class binary_writer
     @tparam NumberType the type of the number
 
     @note This function needs to respect the system's endianess, because bytes
-          in CBOR and MessagePack are stored in network order (big endian) and
-          therefore need reordering on little endian systems.
+          in CBOR, MessagePack, and UBJSON are stored in network order (big
+          endian) and therefore need reordering on little endian systems.
     */
-    template<typename NumberType> void write_number(NumberType n)
+    template<typename NumberType>
+    void write_number(const NumberType n)
     {
         // step 1: write number to array of length NumberType
         std::array<CharType, sizeof(NumberType)> vec;
@@ -6087,6 +8359,240 @@ class binary_writer
         oa->write_characters(vec.data(), sizeof(NumberType));
     }
 
+    // UBJSON: write a floating-point number; if add_prefix is set, its type prefix (see get_ubjson_float_prefix) is written first
+    template<typename NumberType, typename std::enable_if<
+                 std::is_floating_point<NumberType>::value, int>::type = 0>
+    void write_number_with_ubjson_prefix(const NumberType n,
+                                         const bool add_prefix)
+    {
+        if (add_prefix)
+        {
+            oa->write_character(get_ubjson_float_prefix(n));  // overload resolution on n's type picks the prefix
+        }
+        write_number(n);
+    }
+
+    // UBJSON: write an unsigned integer using the first of int8/uint8/int16/int32/int64 that can hold it; throws out_of_range 407 if none can
+    template<typename NumberType, typename std::enable_if<
+                 std::is_unsigned<NumberType>::value, int>::type = 0>
+    void write_number_with_ubjson_prefix(const NumberType n,
+                                         const bool add_prefix)
+    {
+        if (n <= static_cast<uint64_t>((std::numeric_limits<int8_t>::max)()))
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('i'));  // int8
+            }
+            write_number(static_cast<uint8_t>(n));  // n <= int8 max here, so this single byte is also its int8 encoding
+        }
+        else if (n <= (std::numeric_limits<uint8_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('U'));  // uint8
+            }
+            write_number(static_cast<uint8_t>(n));
+        }
+        else if (n <= static_cast<uint64_t>((std::numeric_limits<int16_t>::max)()))
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('I'));  // int16
+            }
+            write_number(static_cast<int16_t>(n));
+        }
+        else if (n <= static_cast<uint64_t>((std::numeric_limits<int32_t>::max)()))
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('l'));  // int32
+            }
+            write_number(static_cast<int32_t>(n));
+        }
+        else if (n <= static_cast<uint64_t>((std::numeric_limits<int64_t>::max)()))
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('L'));  // int64
+            }
+            write_number(static_cast<int64_t>(n));
+        }
+        else  // n exceeds int64 max: none of the integer types written above can hold it
+        {
+            JSON_THROW(out_of_range::create(407, "number overflow serializing " + std::to_string(n)));
+        }
+    }
+
+    // UBJSON: write a signed integer using the first of int8/uint8/int16/int32/int64 whose range contains it
+    template<typename NumberType, typename std::enable_if<
+                 std::is_signed<NumberType>::value and
+                 not std::is_floating_point<NumberType>::value, int>::type = 0>
+    void write_number_with_ubjson_prefix(const NumberType n,
+                                         const bool add_prefix)
+    {
+        if ((std::numeric_limits<int8_t>::min)() <= n and n <= (std::numeric_limits<int8_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('i'));  // int8
+            }
+            write_number(static_cast<int8_t>(n));
+        }
+        else if (static_cast<int64_t>((std::numeric_limits<uint8_t>::min)()) <= n and n <= static_cast<int64_t>((std::numeric_limits<uint8_t>::max)()))  // only values above int8 max get here
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('U'));  // uint8
+            }
+            write_number(static_cast<uint8_t>(n));
+        }
+        else if ((std::numeric_limits<int16_t>::min)() <= n and n <= (std::numeric_limits<int16_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('I'));  // int16
+            }
+            write_number(static_cast<int16_t>(n));
+        }
+        else if ((std::numeric_limits<int32_t>::min)() <= n and n <= (std::numeric_limits<int32_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('l'));  // int32
+            }
+            write_number(static_cast<int32_t>(n));
+        }
+        else if ((std::numeric_limits<int64_t>::min)() <= n and n <= (std::numeric_limits<int64_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(static_cast<CharType>('L'));  // int64
+            }
+            write_number(static_cast<int64_t>(n));
+        }
+        // LCOV_EXCL_START
+        else  // NOTE(review): presumably unreachable while NumberType is at most 64 bits wide (hence the coverage exclusion) - confirm
+        {
+            JSON_THROW(out_of_range::create(407, "number overflow serializing " + std::to_string(n)));
+        }
+        // LCOV_EXCL_STOP
+    }
+
+    /*!
+    @brief determine the UBJSON type prefix of a value (used to detect containers whose elements all share one prefix)
+
+    @note This function does not need to be 100% accurate when it comes to
+          integer limits. In case a number exceeds the limits of int64_t,
+          this will be detected by a later call to function
+          write_number_with_ubjson_prefix. Therefore, we return 'L' for any
+          value that does not fit the previous limits.
+    */
+    CharType ubjson_prefix(const BasicJsonType& j) const noexcept
+    {
+        switch (j.type())
+        {
+            case value_t::null:
+                return 'Z';
+
+            case value_t::boolean:
+                return j.m_value.boolean ? 'T' : 'F';  // true and false have different prefixes
+
+            case value_t::number_integer:  // same order of range checks as the signed overload of write_number_with_ubjson_prefix
+            {
+                if ((std::numeric_limits<int8_t>::min)() <= j.m_value.number_integer and j.m_value.number_integer <= (std::numeric_limits<int8_t>::max)())
+                {
+                    return 'i';
+                }
+                else if ((std::numeric_limits<uint8_t>::min)() <= j.m_value.number_integer and j.m_value.number_integer <= (std::numeric_limits<uint8_t>::max)())
+                {
+                    return 'U';
+                }
+                else if ((std::numeric_limits<int16_t>::min)() <= j.m_value.number_integer and j.m_value.number_integer <= (std::numeric_limits<int16_t>::max)())
+                {
+                    return 'I';
+                }
+                else if ((std::numeric_limits<int32_t>::min)() <= j.m_value.number_integer and j.m_value.number_integer <= (std::numeric_limits<int32_t>::max)())
+                {
+                    return 'l';
+                }
+                else  // no check and assume int64_t (see note above)
+                {
+                    return 'L';
+                }
+            }
+
+            case value_t::number_unsigned:  // same thresholds as the unsigned overload of write_number_with_ubjson_prefix
+            {
+                if (j.m_value.number_unsigned <= (std::numeric_limits<int8_t>::max)())
+                {
+                    return 'i';
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<uint8_t>::max)())
+                {
+                    return 'U';
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<int16_t>::max)())
+                {
+                    return 'I';
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<int32_t>::max)())
+                {
+                    return 'l';
+                }
+                else  // no check and assume int64_t (see note above)
+                {
+                    return 'L';
+                }
+            }
+
+            case value_t::number_float:
+                return get_ubjson_float_prefix(j.m_value.number_float);
+
+            case value_t::string:
+                return 'S';
+
+            case value_t::array:
+                return '[';
+
+            case value_t::object:
+                return '{';
+
+            default:  // discarded values
+                return 'N';
+        }
+    }
+
+    static constexpr CharType get_cbor_float_prefix(float)  // the unnamed argument only selects the overload
+    {
+        return static_cast<CharType>(0xFA);  // Single-Precision Float
+    }
+
+    static constexpr CharType get_cbor_float_prefix(double)  // the unnamed argument only selects the overload
+    {
+        return static_cast<CharType>(0xFB);  // Double-Precision Float
+    }
+
+    static constexpr CharType get_msgpack_float_prefix(float)  // the unnamed argument only selects the overload
+    {
+        return static_cast<CharType>(0xCA);  // float 32
+    }
+
+    static constexpr CharType get_msgpack_float_prefix(double)  // the unnamed argument only selects the overload
+    {
+        return static_cast<CharType>(0xCB);  // float 64
+    }
+
+    static constexpr CharType get_ubjson_float_prefix(float)  // the unnamed argument only selects the overload
+    {
+        return 'd';  // float 32
+    }
+
+    static constexpr CharType get_ubjson_float_prefix(double)  // the unnamed argument only selects the overload
+    {
+        return 'D';  // float 64
+    }
+
   private:
     /// whether we can assume little endianess
     const bool is_little_endian = binary_reader<BasicJsonType>::little_endianess();
@@ -6094,7 +8600,1137 @@ class binary_writer
     /// the output
     output_adapter_t<CharType> oa = nullptr;
 };
+}
+}
 
+// #include <nlohmann/detail/output/serializer.hpp>
+
+
+#include <algorithm> // reverse, remove, fill, find, none_of
+#include <array> // array
+#include <cassert> // assert
+#include <ciso646> // and, or
+#include <clocale> // localeconv, lconv
+#include <cmath> // labs, isfinite, isnan, signbit
+#include <cstddef> // size_t, ptrdiff_t
+#include <cstdint> // uint8_t
+#include <cstdio> // snprintf
+#include <limits> // numeric_limits
+#include <string> // string
+#include <type_traits> // is_same
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/conversions/to_chars.hpp>
+
+
+#include <cassert> // assert
+#include <ciso646> // or, and, not
+#include <cmath>   // signbit, isfinite
+#include <cstdint> // intN_t, uintN_t
+#include <cstring> // memcpy, memmove
+
+namespace nlohmann
+{
+namespace detail
+{
+
+/*!
+@brief implements the Grisu2 algorithm for binary to decimal floating-point
+conversion.
+
+This implementation is a slightly modified version of the reference
+implementation which may be obtained from
+http://florian.loitsch.com/publications (bench.tar.gz).
+
+The code is distributed under the MIT license, Copyright (c) 2009 Florian Loitsch.
+
+For a detailed description of the algorithm see:
+
+[1] Loitsch, "Printing Floating-Point Numbers Quickly and Accurately with
+    Integers", Proceedings of the ACM SIGPLAN 2010 Conference on Programming
+    Language Design and Implementation, PLDI 2010
+[2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and Accurately",
+    Proceedings of the ACM SIGPLAN 1996 Conference on Programming Language
+    Design and Implementation, PLDI 1996
+*/
+namespace dtoa_impl
+{
+
+template <typename Target, typename Source>  // returns a Target holding the same bytes as source
+Target reinterpret_bits(const Source source)
+{
+    static_assert(sizeof(Target) == sizeof(Source), "size mismatch");
+
+    Target target;
+    std::memcpy(&target, &source, sizeof(Source));  // byte-wise copy of source's object representation into target
+    return target;
+}
+
+struct diyfp // f * 2^e
+{
+    static constexpr int kPrecision = 64; // = q
+
+    uint64_t f;  // significand
+    int e;       // binary exponent
+
+    constexpr diyfp() noexcept : f(0), e(0) {}
+    constexpr diyfp(uint64_t f_, int e_) noexcept : f(f_), e(e_) {}
+
+    /*!
+    @brief returns x - y
+    @pre x.e == y.e and x.f >= y.f
+    */
+    static diyfp sub(const diyfp& x, const diyfp& y) noexcept
+    {
+        assert(x.e == y.e);
+        assert(x.f >= y.f);
+
+        return diyfp(x.f - y.f, x.e);
+    }
+
+    /*!
+    @brief returns x * y
+    @note The result is rounded. (Only the upper q bits are returned.)
+    */
+    static diyfp mul(const diyfp& x, const diyfp& y) noexcept
+    {
+        static_assert(kPrecision == 64, "internal error");
+
+        // Computes:
+        //  f = round((x.f * y.f) / 2^q)
+        //  e = x.e + y.e + q
+
+        // Emulate the 64-bit * 64-bit multiplication:
+        //
+        // p = u * v
+        //   = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi)
+        //   = (u_lo v_lo         ) + 2^32 ((u_lo v_hi         ) + (u_hi v_lo         )) + 2^64 (u_hi v_hi         )
+        //   = (p0                ) + 2^32 ((p1                ) + (p2                )) + 2^64 (p3                )
+        //   = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3                )
+        //   = (p0_lo             ) + 2^32 (p0_hi + p1_lo + p2_lo                      ) + 2^64 (p1_hi + p2_hi + p3)
+        //   = (p0_lo             ) + 2^32 (Q                                          ) + 2^64 (H                 )
+        //   = (p0_lo             ) + 2^32 (Q_lo + 2^32 Q_hi                           ) + 2^64 (H                 )
+        //
+        // (Since Q might be larger than 2^32 - 1)
+        //
+        //   = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H)
+        //
+        // (Q_hi + H does not overflow a 64-bit int)
+        //
+        //   = p_lo + 2^64 p_hi
+
+        const uint64_t u_lo = x.f & 0xFFFFFFFF;
+        const uint64_t u_hi = x.f >> 32;
+        const uint64_t v_lo = y.f & 0xFFFFFFFF;
+        const uint64_t v_hi = y.f >> 32;
+
+        const uint64_t p0 = u_lo * v_lo;
+        const uint64_t p1 = u_lo * v_hi;
+        const uint64_t p2 = u_hi * v_lo;
+        const uint64_t p3 = u_hi * v_hi;
+
+        const uint64_t p0_hi = p0 >> 32;  // p0_lo is never needed (see below)
+        const uint64_t p1_lo = p1 & 0xFFFFFFFF;
+        const uint64_t p1_hi = p1 >> 32;
+        const uint64_t p2_lo = p2 & 0xFFFFFFFF;
+        const uint64_t p2_hi = p2 >> 32;
+
+        uint64_t Q = p0_hi + p1_lo + p2_lo;
+
+        // The full product might now be computed as
+        //
+        // p_hi = p3 + p2_hi + p1_hi + (Q >> 32)
+        // p_lo = p0_lo + (Q << 32)
+        //
+        // But in this particular case here, the full p_lo is not required.
+        // Effectively we only need to add the highest bit in p_lo to p_hi (and
+        // Q_hi + 1 does not overflow).
+
+        Q += uint64_t{1} << (64 - 32 - 1); // round, ties up
+
+        const uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32);
+
+        return diyfp(h, x.e + y.e + 64);
+    }
+
+    /*!
+    @brief normalize x such that the significand is >= 2^(q-1)
+    @pre x.f != 0
+    */
+    static diyfp normalize(diyfp x) noexcept
+    {
+        assert(x.f != 0);
+
+        while ((x.f >> 63) == 0)  // shift left until the top bit is set, adjusting e to keep the value
+        {
+            x.f <<= 1;
+            x.e--;
+        }
+
+        return x;
+    }
+
+    /*!
+    @brief shift the significand of x so that the result has the exponent target_exponent
+    @pre x.e >= target_exponent and the upper x.e - target_exponent bits of x.f must be zero.
+    */
+    static diyfp normalize_to(const diyfp& x, const int target_exponent) noexcept
+    {
+        const int delta = x.e - target_exponent;  // number of bits to shift the significand left
+
+        assert(delta >= 0);
+        assert(((x.f << delta) >> delta) == x.f);  // the shift must not push set bits out of the top
+
+        return diyfp(x.f << delta, target_exponent);
+    }
+};
+
+struct boundaries  // result of compute_boundaries
+{
+    diyfp w;      // the input value, normalized
+    diyfp minus;  // lower boundary m-, brought to the same exponent as plus
+    diyfp plus;   // upper boundary m+, normalized
+};
+
+/*!
+Compute the (normalized) diyfp representing the input number 'value' and its
+boundaries.
+
+@pre value must be finite and positive
+*/
+template <typename FloatType>
+boundaries compute_boundaries(FloatType value)
+{
+    assert(std::isfinite(value));
+    assert(value > 0);
+
+    // Convert the IEEE representation into a diyfp.
+    //
+    // If v is denormal:
+    //      value = 0.F * 2^(1 - bias) = (          F) * 2^(1 - bias - (p-1))
+    // If v is normalized:
+    //      value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1))
+
+    static_assert(std::numeric_limits<FloatType>::is_iec559,
+                  "internal error: dtoa_short requires an IEEE-754 floating-point implementation");
+
+    constexpr int      kPrecision = std::numeric_limits<FloatType>::digits; // = p (includes the hidden bit)
+    constexpr int      kBias      = std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1);  // = bias + (p-1) in the formulas above
+    constexpr int      kMinExp    = 1 - kBias;
+    constexpr uint64_t kHiddenBit = uint64_t{1} << (kPrecision - 1); // = 2^(p-1)
+
+    using bits_type = typename std::conditional< kPrecision == 24, uint32_t, uint64_t >::type;  // 32-bit pattern when p == 24, 64-bit otherwise
+
+    const uint64_t bits = reinterpret_bits<bits_type>(value);
+    const uint64_t E = bits >> (kPrecision - 1);  // everything above the fraction bits (value > 0 is asserted above)
+    const uint64_t F = bits & (kHiddenBit - 1);   // the p-1 stored fraction bits
+
+    const bool is_denormal = (E == 0);
+    const diyfp v = is_denormal
+                    ? diyfp(F, kMinExp)
+                    : diyfp(F + kHiddenBit, static_cast<int>(E) - kBias);
+
+    // Compute the boundaries m- and m+ of the floating-point value
+    // v = f * 2^e.
+    //
+    // Determine v- and v+, the floating-point predecessor and successor of v,
+    // respectively.
+    //
+    //      v- = v - 2^e        if f != 2^(p-1) or e == e_min                (A)
+    //         = v - 2^(e-1)    if f == 2^(p-1) and e > e_min                (B)
+    //
+    //      v+ = v + 2^e
+    //
+    // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_
+    // between m- and m+ round to v, regardless of how the input rounding
+    // algorithm breaks ties.
+    //
+    //      ---+-------------+-------------+-------------+-------------+---  (A)
+    //         v-            m-            v             m+            v+
+    //
+    //      -----------------+------+------+-------------+-------------+---  (B)
+    //                       v-     m-     v             m+            v+
+
+    const bool lower_boundary_is_closer = (F == 0 and E > 1);  // case (B) above
+    const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1);
+    const diyfp m_minus = lower_boundary_is_closer
+                          ? diyfp(4 * v.f - 1, v.e - 2)  // (B)
+                          : diyfp(2 * v.f - 1, v.e - 1); // (A)
+
+    // Determine the normalized w+ = m+.
+    const diyfp w_plus = diyfp::normalize(m_plus);
+
+    // Determine w- = m- such that e_(w-) = e_(w+).
+    const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e);
+
+    return {diyfp::normalize(v), w_minus, w_plus};
+}
+
+// Given normalized diyfp w, Grisu needs to find a (normalized) cached
+// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies
+// within a certain range [alpha, gamma] (Definition 3.2 from [1])
+//
+//      alpha <= e = e_c + e_w + q <= gamma
+//
+// or
+//
+//      f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q
+//                          <= f_c * f_w * 2^gamma
+//
+// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies
+//
+//      2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma
+//
+// or
+//
+//      2^(q - 2 + alpha) <= c * w < 2^(q + gamma)
+//
+// The choice of (alpha,gamma) determines the size of the table and the form of
+// the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well
+// in practice:
+//
+// The idea is to cut the number c * w = f * 2^e into two parts, which can be
+// processed independently: An integral part p1, and a fractional part p2:
+//
+//      f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e
+//              = (f div 2^-e) + (f mod 2^-e) * 2^e
+//              = p1 + p2 * 2^e
+//
+// The conversion of p1 into decimal form requires a series of divisions and
+// modulos by (a power of) 10. These operations are faster for 32-bit than for
+// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be
+// achieved by choosing
+//
+//      -e >= 32   or   e <= -32 := gamma
+//
+// In order to convert the fractional part
+//
+//      p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ...
+//
+// into decimal form, the fraction is repeatedly multiplied by 10 and the digits
+// d[-i] are extracted in order:
+//
+//      (10 * p2) div 2^-e = d[-1]
+//      (10 * p2) mod 2^-e = d[-2] / 10^1 + ...
+//
+// The multiplication by 10 must not overflow. It is sufficient to choose
+//
+//      10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64.
+//
+// Since p2 = f mod 2^-e < 2^-e,
+//
+//      -e <= 60   or   e >= -60 := alpha
+
+constexpr int kAlpha = -60;  // lower bound alpha on the binary exponent of the product c * w (derived above)
+constexpr int kGamma = -32;  // upper bound gamma on the binary exponent of the product c * w (derived above)
+
+struct cached_power // c = f * 2^e ~= 10^k
+{
+    uint64_t f;  // significand of c
+    int e;       // binary exponent of c
+    int k;       // decimal exponent that c approximates
+};
+
+/*!
+For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached
+power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c
+satisfies (Definition 3.2 from [1])
+
+     alpha <= e_c + e + q <= gamma.
+*/
+inline cached_power get_cached_power_for_binary_exponent(int e)
+{
+    // Now
+    //
+    //      alpha <= e_c + e + q <= gamma                                    (1)
+    //      ==> f_c * 2^alpha <= c * 2^e * 2^q
+    //
+    // and since the c's are normalized, 2^(q-1) <= f_c,
+    //
+    //      ==> 2^(q - 1 + alpha) <= c * 2^(e + q)
+    //      ==> 2^(alpha - e - 1) <= c
+    //
+    // If c were an exact power of ten, i.e. c = 10^k, one may determine k as
+    //
+    //      k = ceil( log_10( 2^(alpha - e - 1) ) )
+    //        = ceil( (alpha - e - 1) * log_10(2) )
+    //
+    // From the paper:
+    // "In theory the result of the procedure could be wrong since c is rounded,
+    //  and the computation itself is approximated [...]. In practice, however,
+    //  this simple function is sufficient."
+    //
+    // For IEEE double precision floating-point numbers converted into
+    // normalized diyfp's w = f * 2^e, with q = 64,
+    //
+    //      e >= -1022      (min IEEE exponent)
+    //           -52        (p - 1)
+    //           -52        (p - 1, possibly normalize denormal IEEE numbers)
+    //           -11        (normalize the diyfp)
+    //         = -1137
+    //
+    // and
+    //
+    //      e <= +1023      (max IEEE exponent)
+    //           -52        (p - 1)
+    //           -11        (normalize the diyfp)
+    //         = 960
+    //
+    // This binary exponent range [-1137,960] results in a decimal exponent
+    // range [-307,324]. One does not need to store a cached power for each
+    // k in this range. For each such k it suffices to find a cached power
+    // such that the exponent of the product lies in [alpha,gamma].
+    // This implies that the difference of the decimal exponents of adjacent
+    // table entries must be less than or equal to
+    //
+    //      floor( (gamma - alpha) * log_10(2) ) = 8.
+    //
+    // (A smaller distance gamma-alpha would require a larger table.)
+
+    // NB:
+    // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34.
+
+    constexpr int kCachedPowersSize = 79;  // number of entries in kCachedPowers
+    constexpr int kCachedPowersMinDecExp = -300;  // decimal exponent k of the first entry
+    constexpr int kCachedPowersDecStep = 8;  // difference in k between adjacent entries
+
+    static constexpr cached_power kCachedPowers[] =
+    {
+        { 0xAB70FE17C79AC6CA, -1060, -300 },
+        { 0xFF77B1FCBEBCDC4F, -1034, -292 },
+        { 0xBE5691EF416BD60C, -1007, -284 },
+        { 0x8DD01FAD907FFC3C,  -980, -276 },
+        { 0xD3515C2831559A83,  -954, -268 },
+        { 0x9D71AC8FADA6C9B5,  -927, -260 },
+        { 0xEA9C227723EE8BCB,  -901, -252 },
+        { 0xAECC49914078536D,  -874, -244 },
+        { 0x823C12795DB6CE57,  -847, -236 },
+        { 0xC21094364DFB5637,  -821, -228 },
+        { 0x9096EA6F3848984F,  -794, -220 },
+        { 0xD77485CB25823AC7,  -768, -212 },
+        { 0xA086CFCD97BF97F4,  -741, -204 },
+        { 0xEF340A98172AACE5,  -715, -196 },
+        { 0xB23867FB2A35B28E,  -688, -188 },
+        { 0x84C8D4DFD2C63F3B,  -661, -180 },
+        { 0xC5DD44271AD3CDBA,  -635, -172 },
+        { 0x936B9FCEBB25C996,  -608, -164 },
+        { 0xDBAC6C247D62A584,  -582, -156 },
+        { 0xA3AB66580D5FDAF6,  -555, -148 },
+        { 0xF3E2F893DEC3F126,  -529, -140 },
+        { 0xB5B5ADA8AAFF80B8,  -502, -132 },
+        { 0x87625F056C7C4A8B,  -475, -124 },
+        { 0xC9BCFF6034C13053,  -449, -116 },
+        { 0x964E858C91BA2655,  -422, -108 },
+        { 0xDFF9772470297EBD,  -396, -100 },
+        { 0xA6DFBD9FB8E5B88F,  -369,  -92 },
+        { 0xF8A95FCF88747D94,  -343,  -84 },
+        { 0xB94470938FA89BCF,  -316,  -76 },
+        { 0x8A08F0F8BF0F156B,  -289,  -68 },
+        { 0xCDB02555653131B6,  -263,  -60 },
+        { 0x993FE2C6D07B7FAC,  -236,  -52 },
+        { 0xE45C10C42A2B3B06,  -210,  -44 },
+        { 0xAA242499697392D3,  -183,  -36 },
+        { 0xFD87B5F28300CA0E,  -157,  -28 },
+        { 0xBCE5086492111AEB,  -130,  -20 },
+        { 0x8CBCCC096F5088CC,  -103,  -12 },
+        { 0xD1B71758E219652C,   -77,   -4 },
+        { 0x9C40000000000000,   -50,    4 },
+        { 0xE8D4A51000000000,   -24,   12 },
+        { 0xAD78EBC5AC620000,     3,   20 },
+        { 0x813F3978F8940984,    30,   28 },
+        { 0xC097CE7BC90715B3,    56,   36 },
+        { 0x8F7E32CE7BEA5C70,    83,   44 },
+        { 0xD5D238A4ABE98068,   109,   52 },
+        { 0x9F4F2726179A2245,   136,   60 },
+        { 0xED63A231D4C4FB27,   162,   68 },
+        { 0xB0DE65388CC8ADA8,   189,   76 },
+        { 0x83C7088E1AAB65DB,   216,   84 },
+        { 0xC45D1DF942711D9A,   242,   92 },
+        { 0x924D692CA61BE758,   269,  100 },
+        { 0xDA01EE641A708DEA,   295,  108 },
+        { 0xA26DA3999AEF774A,   322,  116 },
+        { 0xF209787BB47D6B85,   348,  124 },
+        { 0xB454E4A179DD1877,   375,  132 },
+        { 0x865B86925B9BC5C2,   402,  140 },
+        { 0xC83553C5C8965D3D,   428,  148 },
+        { 0x952AB45CFA97A0B3,   455,  156 },
+        { 0xDE469FBD99A05FE3,   481,  164 },
+        { 0xA59BC234DB398C25,   508,  172 },
+        { 0xF6C69A72A3989F5C,   534,  180 },
+        { 0xB7DCBF5354E9BECE,   561,  188 },
+        { 0x88FCF317F22241E2,   588,  196 },
+        { 0xCC20CE9BD35C78A5,   614,  204 },
+        { 0x98165AF37B2153DF,   641,  212 },
+        { 0xE2A0B5DC971F303A,   667,  220 },
+        { 0xA8D9D1535CE3B396,   694,  228 },
+        { 0xFB9B7CD9A4A7443C,   720,  236 },
+        { 0xBB764C4CA7A44410,   747,  244 },
+        { 0x8BAB8EEFB6409C1A,   774,  252 },
+        { 0xD01FEF10A657842C,   800,  260 },
+        { 0x9B10A4E5E9913129,   827,  268 },
+        { 0xE7109BFBA19C0C9D,   853,  276 },
+        { 0xAC2820D9623BF429,   880,  284 },
+        { 0x80444B5E7AA7CF85,   907,  292 },
+        { 0xBF21E44003ACDD2D,   933,  300 },
+        { 0x8E679C2F5E44FF8F,   960,  308 },
+        { 0xD433179D9C8CB841,   986,  316 },
+        { 0x9E19DB92B4E31BA9,  1013,  324 },
+    };
+
+    // This computation gives exactly the same results for k as
+    //      k = ceil((kAlpha - e - 1) * 0.30102999566398114)
+    // for |e| <= 1500, but doesn't require floating-point operations.
+    // NB: log_10(2) ~= 78913 / 2^18
+    assert(e >= -1500);
+    assert(e <=  1500);
+    const int f = kAlpha - e - 1;
+    const int k = (f * 78913) / (1 << 18) + (f > 0);  // the division truncates; + (f > 0) adds one for positive f
+
+    const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / kCachedPowersDecStep;  // offset from the first entry, divided by the step and rounded up
+    assert(index >= 0);
+    assert(index < kCachedPowersSize);
+    static_cast<void>(kCachedPowersSize); // Fix warning.
+
+    const cached_power cached = kCachedPowers[index];
+    assert(kAlpha <= cached.e + e + 64);  // the product exponent must land in [kAlpha, kGamma]
+    assert(kGamma >= cached.e + e + 64);
+
+    return cached;
+}
+
+/*!
+@brief determines the number of decimal digits of n
+Returns k and sets pow10 := 10^(k-1), so that pow10 <= n < 10^k for n != 0; n == 0 yields k = 1 and pow10 = 1.
+*/
+inline int find_largest_pow10(const uint32_t n, uint32_t& pow10)
+{
+    // The thresholds are tested in ascending order; every guard returns as
+    // soon as n is known to lie below the next power of ten.
+    if (n < 10)
+    {
+        pow10 = 1;
+        return 1;
+    }
+    if (n < 100)
+    {
+        pow10 = 10;
+        return 2;
+    }
+    if (n < 1000)
+    {
+        pow10 = 100;
+        return 3;
+    }
+    if (n < 10000)
+    {
+        pow10 = 1000;
+        return 4;
+    }
+    if (n < 100000)
+    {
+        pow10 = 10000;
+        return 5;
+    }
+    if (n < 1000000)
+    {
+        pow10 = 100000;
+        return 6;
+    }
+    if (n < 10000000)
+    {
+        pow10 = 1000000;
+        return 7;
+    }
+    if (n < 100000000)
+    {
+        pow10 = 10000000;
+        return 8;
+    }
+    if (n < 1000000000)
+    {
+        pow10 = 100000000;
+        return 9;
+    }
+    // Only values >= 10^9 are left (a uint32_t never has more than 10 digits).
+    // LCOV_EXCL_START
+    pow10 = 1000000000;
+    return 10;
+    // LCOV_EXCL_STOP
+}
+
+inline void grisu2_round(char* buf, int len, uint64_t dist, uint64_t delta,
+                         uint64_t rest, uint64_t ten_k)
+{
+    assert(len >= 1);      // buf[len - 1] is modified below, so a digit must exist
+    assert(dist <= delta); // w is not below M-
+    assert(rest <= delta); // V is not below M-
+    assert(ten_k > 0);     // a zero step would make no progress in the loop below
+
+    //               <--------------------------- delta ---->
+    //                                  <---- dist --------->
+    // --------------[------------------+-------------------]--------------
+    //               M-                 w                   M+
+    //
+    //                                  ten_k
+    //                                <------>
+    //                                       <---- rest ---->
+    // --------------[------------------+----+--------------]--------------
+    //                                  w    V
+    //                                       = buf * 10^k
+    //
+    // ten_k represents a unit-in-the-last-place in the decimal representation
+    // stored in buf (V = buf * 10^k); rest = M+ - V and dist = M+ - w as drawn above.
+    // Decrement the last digit of buf (i.e. V -= ten_k) while this takes V closer to w.
+
+    // The tests are written in this order to avoid overflow in unsigned
+    // integer arithmetic.
+
+    while (rest < dist                // V is still above w
+            and delta - rest >= ten_k // V - ten_k would not fall below M-
+            and (rest + ten_k < dist or dist - rest > rest + ten_k - dist)) // V - ten_k stays above w, or ends up closer to w than V
+    {
+        assert(buf[len - 1] != '0'); // the decrement must not need a borrow
+        buf[len - 1]--;              // V -= ten_k
+        rest += ten_k;               // the distance from V up to M+ grows by the same step
+    }
+}
+
+/*!
+Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+; the digits are appended to buffer (length is advanced) and decimal_exponent is adjusted in place.
+M- and M+ must be normalized and share the same exponent -60 <= e <= -32.
+*/
+inline void grisu2_digit_gen(char* buffer, int& length, int& decimal_exponent,
+                             diyfp M_minus, diyfp w, diyfp M_plus)
+{
+    static_assert(kAlpha >= -60, "internal error");
+    static_assert(kGamma <= -32, "internal error");
+
+    // Generates the digits (and the exponent) of a decimal floating-point
+    // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's
+    // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= gamma.
+    //
+    //               <--------------------------- delta ---->
+    //                                  <---- dist --------->
+    // --------------[------------------+-------------------]--------------
+    //               M-                 w                   M+
+    //
+    // Grisu2 generates the digits of M+ from left to right and stops as soon as
+    // V is in [M-,M+].
+
+    assert(M_plus.e >= kAlpha);
+    assert(M_plus.e <= kGamma);
+
+    uint64_t delta = diyfp::sub(M_plus, M_minus).f; // (significand of (M+ - M-), implicit exponent is e)
+    uint64_t dist  = diyfp::sub(M_plus, w      ).f; // (significand of (M+ - w ), implicit exponent is e)
+
+    // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0):
+    //
+    //      M+ = f * 2^e
+    //         = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e
+    //         = ((p1        ) * 2^-e + (p2        )) * 2^e
+    //         = p1 + p2 * 2^e
+
+    const diyfp one(uint64_t{1} << -M_plus.e, M_plus.e);
+
+    uint32_t p1 = static_cast<uint32_t>(M_plus.f >> -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.)
+    uint64_t p2 = M_plus.f & (one.f - 1);                    // p2 = f mod 2^-e
+
+    // 1)
+    //
+    // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0]
+
+    assert(p1 > 0);
+
+    uint32_t pow10;
+    const int k = find_largest_pow10(p1, pow10);
+
+    //      10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1)
+    //
+    //      p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1))
+    //         = (d[k-1]         ) * 10^(k-1) + (p1 mod 10^(k-1))
+    //
+    //      M+ = p1                                             + p2 * 2^e
+    //         = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1))          + p2 * 2^e
+    //         = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e
+    //         = d[k-1] * 10^(k-1) + (                         rest) * 2^e
+    //
+    // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0)
+    //
+    //      p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0]
+    //
+    // but stop as soon as
+    //
+    //      rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e
+
+    int n = k; // number of integral digits not yet written to buffer
+    while (n > 0)
+    {
+        // Invariants:
+        //      M+ = buffer * 10^n + (p1 + p2 * 2^e)    (buffer = 0 for n = k)
+        //      pow10 = 10^(n-1) <= p1 < 10^n
+        //
+        const uint32_t d = p1 / pow10;  // d = p1 div 10^(n-1)
+        const uint32_t r = p1 % pow10;  // r = p1 mod 10^(n-1)
+        //
+        //      M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e
+        //         = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e)
+        //
+        assert(d <= 9);
+        buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
+        //
+        //      M+ = buffer * 10^(n-1) + (r + p2 * 2^e)
+        //
+        p1 = r;
+        n--;
+        //
+        //      M+ = buffer * 10^n + (p1 + p2 * 2^e)
+        //      pow10 = 10^n
+        //
+
+        // Now check if enough digits have been generated.
+        // Compute
+        //
+        //      p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e
+        //
+        // Note:
+        // Since rest and delta share the same exponent e, it suffices to
+        // compare the significands.
+        const uint64_t rest = (uint64_t{p1} << -one.e) + p2;
+        if (rest <= delta)
+        {
+            // V = buffer * 10^n, with M- <= V <= M+.
+
+            decimal_exponent += n;
+
+            // We may now just stop. But instead look if the buffer could be
+            // decremented to bring V closer to w.
+            //
+            // pow10 = 10^n is now 1 ulp in the decimal representation V.
+            // The rounding procedure works with diyfp's with an implicit
+            // exponent of e.
+            //
+            //      10^n = (10^n * 2^-e) * 2^e = ulp * 2^e
+            //
+            const uint64_t ten_n = uint64_t{pow10} << -one.e;
+            grisu2_round(buffer, length, dist, delta, rest, ten_n);
+
+            return;
+        }
+
+        pow10 /= 10;
+        //
+        //      pow10 = 10^(n-1) <= p1 < 10^n
+        // Invariants restored.
+    }
+
+    // 2)
+    //
+    // The digits of the integral part have been generated:
+    //
+    //      M+ = d[k-1]...d[1]d[0] + p2 * 2^e
+    //         = buffer            + p2 * 2^e
+    //
+    // Now generate the digits of the fractional part p2 * 2^e.
+    //
+    // Note:
+    // No decimal point is generated: the exponent is adjusted instead.
+    //
+    // p2 actually represents the fraction
+    //
+    //      p2 * 2^e
+    //          = p2 / 2^-e
+    //          = d[-1] / 10^1 + d[-2] / 10^2 + ...
+    //
+    // Now generate the digits d[-m] of p2 from left to right (m = 1,2,...)
+    //
+    //      p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m
+    //                      + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...)
+    //
+    // using
+    //
+    //      10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e)
+    //                = (                   d) * 2^-e + (                   r)
+    //
+    // or
+    //      10^m * p2 * 2^e = d + r * 2^e
+    //
+    // i.e.
+    //
+    //      M+ = buffer + p2 * 2^e
+    //         = buffer + 10^-m * (d + r * 2^e)
+    //         = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e
+    //
+    // and stop as soon as 10^-m * r * 2^e <= delta * 2^e
+
+    assert(p2 > delta); // otherwise the loop in 1) would have returned at n == 0
+
+    int m = 0; // number of fractional digits written so far
+    for (;;)
+    {
+        // Invariant:
+        //      M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...)
+        //         = buffer * 10^-m + 10^-m * (p2                                 ) * 2^e
+        //         = buffer * 10^-m + 10^-m * (1/10 * (10 * p2)                   ) * 2^e
+        //         = buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + (10*p2 mod 2^-e))) * 2^e
+        //
+        assert(p2 <= UINT64_MAX / 10); // the multiplication by 10 below must not overflow
+        p2 *= 10;
+        const uint64_t d = p2 >> -one.e;     // d = (10 * p2) div 2^-e
+        const uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e
+        //
+        //      M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r)) * 2^e
+        //         = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e))
+        //         = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e
+        //
+        assert(d <= 9);
+        buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
+        //
+        //      M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e
+        //
+        p2 = r;
+        m++;
+        //
+        //      M+ = buffer * 10^-m + 10^-m * p2 * 2^e
+        // Invariant restored.
+
+        // Check if enough digits have been generated.
+        //
+        //      10^-m * p2 * 2^e <= delta * 2^e
+        //              p2 * 2^e <= 10^m * delta * 2^e
+        //                    p2 <= 10^m * delta
+        delta *= 10;
+        dist  *= 10;
+        if (p2 <= delta)
+        {
+            break;
+        }
+    }
+
+    // V = buffer * 10^-m, with M- <= V <= M+.
+
+    decimal_exponent -= m;
+
+    // 1 ulp in the decimal representation is now 10^-m.
+    // Since delta and dist are now scaled by 10^m, we need to do the
+    // same with ulp in order to keep the units in sync.
+    //
+    //      10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e
+    //
+    const uint64_t ten_m = one.f;
+    grisu2_round(buffer, length, dist, delta, p2, ten_m);
+
+    // By construction this algorithm generates the shortest possible decimal
+    // number (Loitsch, Theorem 6.2) which rounds back to w.
+    // For an input number of precision p, at least
+    //
+    //      N = 1 + ceil(p * log_10(2))
+    //
+    // decimal digits are sufficient to identify all binary floating-point
+    // numbers (Matula, "In-and-Out conversions").
+    // This implies that the algorithm does not produce more than N decimal
+    // digits.
+    //
+    //      N = 17 for p = 53 (IEEE double precision)
+    //      N = 9  for p = 24 (IEEE single precision)
+}
+
+/*!
+Core of Grisu2: given v and its boundaries m- and m+ (all three sharing one
+binary exponent), produces digits such that v = buf * 10^decimal_exponent.
+len counts the decimal digits in buf; the buffer must hold >= max_digits10.
+*/
+inline void grisu2(char* buf, int& len, int& decimal_exponent,
+                   diyfp m_minus, diyfp v, diyfp m_plus)
+{
+    assert(m_plus.e == m_minus.e);
+    assert(m_plus.e == v.e);
+
+    // The boundaries may lie evenly around v (A) or not (B):
+    //
+    //  --------(-----------------------+-----------------------)--------    (A)
+    //          m-                      v                       m+
+    //
+    //  --------------------(-----------+-----------------------)--------    (B)
+    //                      m-          v                       m+
+    //
+    // Step 1: look up a cached power c ~= 10^-k that moves the binary exponent
+    // of the products below into the range [alpha, gamma].
+
+    const cached_power scale = get_cached_power_for_binary_exponent(m_plus.e);
+    const diyfp ten_pow(scale.f, scale.e);
+
+    // Step 2: scale v and both boundaries by c.
+    const diyfp w       = diyfp::mul(v,       ten_pow);
+    const diyfp w_lower = diyfp::mul(m_minus, ten_pow);
+    const diyfp w_upper = diyfp::mul(m_plus,  ten_pow);
+
+    //  ----(---+---)---------------(---+---)---------------(---+---)----
+    //          w-                      w                       w+
+    //          = c*m-                  = c*v                   = c*m+
+    //
+    // diyfp::mul rounds its result and c is only an approximation as well, so
+    // w, w- and w+ are each off by a small amount:
+    //
+    //      w - v * 10^k < 1 ulp
+    //
+    // Step 3: move each boundary inwards by 1 ulp to compensate.
+    //
+    //  --------+---[---------------(---+---)---------------]---+--------
+    //          w-  M-                  w                   M+  w+
+    //
+    // Every number in [M-, M+] (bounds included) now rounds to w when read
+    // back, regardless of how the reader breaks ties, and digit_gen produces
+    // the shortest such number in [M-, M+]. That is not necessarily the
+    // shortest number in the original interval (m-, m+).
+
+    const diyfp lower(w_lower.f + 1, w_lower.e);
+    const diyfp upper(w_upper.f - 1, w_upper.e);
+
+    // scale.k is the decimal exponent of c, i.e. -k; hence -(-k) = k.
+    decimal_exponent = -scale.k;
+
+    grisu2_digit_gen(buf, len, decimal_exponent, lower, w, upper);
+}
+
+/*!
+Convenience overload: computes the boundaries of value and runs Grisu2 on them,
+so that value = buf * 10^decimal_exponent with len decimal digits in buf.
+The buffer must be large enough, i.e. >= max_digits10.
+*/
+template <typename FloatType>
+void grisu2(char* buf, int& len, int& decimal_exponent, FloatType value)
+{
+    static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3,
+                  "internal error: not enough precision");
+
+    assert(std::isfinite(value));
+    assert(value > 0);
+
+    // There are two ways to compute the neighbors (and boundaries) of 'value':
+    //
+    // (a) always for double precision. All float's can then be recovered using
+    //     strtod (and strtof), but the resulting decimal representations are
+    //     not exactly "short".
+    //     The documentation for 'std::to_chars'
+    //     (https://en.cppreference.com/w/cpp/utility/to_chars) says "value is
+    //     converted to a string as if by std::sprintf in the default ("C") locale";
+    //     sprintf promotes float's to double's, which suggests this variant.
+    // (b) for the precision of FloatType. The same documentation requires that
+    //     "parsing the representation using the corresponding std::from_chars
+    //     function recovers value exactly", which indicates that floats should be
+    //     recovered using 'std::strtof'.
+    //     NB: with (b) there is a single float (7.0385307e-26f) which can't be
+    //     recovered using strtod; the resulting double is off by 1 ulp.
+    // Variant (b) is the one compiled in; (a) is kept below for reference.
+#if 0
+    const boundaries bounds = compute_boundaries(static_cast<double>(value));
+#else
+    const boundaries bounds = compute_boundaries(value);
+#endif
+
+    grisu2(buf, len, decimal_exponent, bounds.minus, bounds.w, bounds.plus);
+}
+
+/*!
+@brief writes the sign and the decimal digits of the exponent e to buf
+
+The output is a mandatory sign character followed by two digits, or by three
+digits when |e| >= 100 (e.g. 7 -> "+07", -42 -> "-42", 123 -> "+123").
+
+@return a pointer to the element following the exponent.
+@pre -1000 < e < 1000
+*/
+inline char* append_exponent(char* buf, int e)
+{
+    assert(e > -1000);
+    assert(e <  1000);
+
+    // Emit the sign first; from here on only the magnitude of e matters.
+    if (e >= 0)
+    {
+        *buf++ = '+';
+    }
+    else
+    {
+        *buf++ = '-';
+        e = -e;
+    }
+
+    // |e| < 1000 (see the precondition), so the magnitude splits into at most
+    // three decimal digits.
+    const uint32_t magnitude = static_cast<uint32_t>(e);
+    const uint32_t hundreds  = magnitude / 100;
+    const uint32_t tens      = (magnitude / 10) % 10;
+    const uint32_t ones      = magnitude % 10;
+
+    // The hundreds digit is only written when it is non-zero.
+    if (hundreds != 0)
+    {
+        *buf++ = static_cast<char>('0' + hundreds);
+    }
+
+    // Always print at least two digits in the exponent.
+    // This is for compatibility with printf("%g").
+    *buf++ = static_cast<char>('0' + tens);
+    *buf++ = static_cast<char>('0' + ones);
+
+    return buf;
+}
+
+/*!
+@brief rewrites the digits in buf, in place, into the final number text
+
+With v = buf * 10^decimal_exponent: if v is in the range [10^min_exp, 10^max_exp)
+the result uses fixed-point notation, otherwise exponential notation.
+
+@pre min_exp < 0
+@pre max_exp > 0
+*/
+inline char* format_buffer(char* buf, int len, int decimal_exponent,
+                           int min_exp, int max_exp)
+{
+    assert(min_exp < 0);
+    assert(max_exp > 0);
+
+    // num_digits: count of decimal digits currently stored in buf.
+    // point:      position of the decimal point relative to the start of buf,
+    //             i.e. v = buf * 10^(point - num_digits).
+    const int num_digits = len;
+    const int point = len + decimal_exponent;
+
+    // Classify the three fixed-point layouts; they are tried in this order.
+    const bool pad_with_zeros  = (num_digits <= point) && (point <= max_exp);
+    const bool point_in_digits = (0 < point) && (point <= max_exp);
+    const bool zeros_in_front  = (min_exp < point) && (point <= 0);
+
+    if (pad_with_zeros)
+    {
+        // digits[000]
+        // len <= max_exp + 2
+
+        char* const tail = buf + point;
+        std::memset(buf + num_digits, '0', static_cast<size_t>(point - num_digits));
+        // Make it look like a floating-point number (#362, #378)
+        tail[0] = '.';
+        tail[1] = '0';
+        return tail + 2;
+    }
+
+    if (point_in_digits)
+    {
+        // dig.its
+        // len <= max_digits10 + 1
+        assert(num_digits > point);
+
+        char* const split = buf + point;
+        std::memmove(split + 1, split, static_cast<size_t>(num_digits - point));
+        *split = '.';
+        return buf + (num_digits + 1);
+    }
+
+    if (zeros_in_front)
+    {
+        // 0.[000]digits
+        // len <= 2 + (-min_exp - 1) + max_digits10
+
+        const int zeros = -point;
+        std::memmove(buf + (2 + zeros), buf, static_cast<size_t>(num_digits));
+        buf[0] = '0';
+        buf[1] = '.';
+        std::memset(buf + 2, '0', static_cast<size_t>(zeros));
+        return buf + (2 + zeros + num_digits);
+    }
+
+    // Exponential notation:
+    //   dE+123        for a single digit  (len <= 1 + 5)
+    //   d.igitsE+123  otherwise           (len <= max_digits10 + 1 + 5)
+    char* out = buf + 1;
+    if (num_digits != 1)
+    {
+        std::memmove(buf + 2, buf + 1, static_cast<size_t>(num_digits - 1));
+        buf[1] = '.';
+        out = buf + 1 + num_digits;
+    }
+
+    *out++ = 'e';
+    return append_exponent(out, point - 1);
+}
+
+} // namespace dtoa_impl
+
+/*!
+@brief writes a decimal representation of the floating-point number value to [first, last).
+
+The layout of the text is similar to printf's %g format. The returned
+iterator points past-the-end of the characters that were written.
+
+@note The input number must be finite, i.e. NaN's and Inf's are not supported.
+@note The buffer must be large enough.
+@note The result is NOT null-terminated.
+*/
+template <typename FloatType>
+char* to_chars(char* first, char* last, FloatType value)
+{
+    static_cast<void>(last); // maybe unused - fix warning
+    assert(std::isfinite(value));
+
+    // std::signbit (rather than value < 0) is used because it also detects -0.
+    const bool negative = std::signbit(value);
+    if (negative)
+    {
+        *first++ = '-';
+        value = -value;
+    }
+
+    if (value == 0) // +-0
+    {
+        // Make it look like a floating-point number (#362, #378)
+        first[0] = '0';
+        first[1] = '.';
+        first[2] = '0';
+        return first + 3;
+    }
+
+    assert(last - first >= std::numeric_limits<FloatType>::max_digits10);
+
+    // Let Grisu2 fill the buffer so that value = digits * 10^exponent10.
+    // The num_digits characters starting at first are to be read as an
+    // unsigned decimal integer.
+    int num_digits = 0;
+    int exponent10 = 0;
+    dtoa_impl::grisu2(first, num_digits, exponent10, value);
+
+    assert(num_digits <= std::numeric_limits<FloatType>::max_digits10);
+
+    // Format the buffer like printf("%.*g", prec, value)
+    // (digits10 is the upper limit to increase compatibility with version 2).
+    constexpr int min_exp10 = -4;
+    constexpr int max_exp10 = std::numeric_limits<FloatType>::digits10;
+
+    assert(last - first >= max_exp10 + 2);
+    assert(last - first >= 2 + (-min_exp10 - 1) + std::numeric_limits<FloatType>::max_digits10);
+    assert(last - first >= std::numeric_limits<FloatType>::max_digits10 + 6);
+
+    return dtoa_impl::format_buffer(first, num_digits, exponent10, min_exp10, max_exp10);
+}
+
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/output/output_adapters.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
 ///////////////////
 // serialization //
 ///////////////////
@@ -6106,6 +9742,9 @@ class serializer
     using number_float_t = typename BasicJsonType::number_float_t;
     using number_integer_t = typename BasicJsonType::number_integer_t;
     using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    static constexpr uint8_t UTF8_ACCEPT = 0;
+    static constexpr uint8_t UTF8_REJECT = 1;
+
   public:
     /*!
     @param[in] s  output stream to serialize to
@@ -6113,9 +9752,10 @@ class serializer
     */
     serializer(output_adapter_t<char> s, const char ichar)
         : o(std::move(s)), loc(std::localeconv()),
-          thousands_sep(loc->thousands_sep == nullptr ? '\0' : loc->thousands_sep[0]),
-          decimal_point(loc->decimal_point == nullptr ? '\0' : loc->decimal_point[0]),
-          indent_char(ichar), indent_string(512, indent_char) {}
+          thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)),
+          decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)),
+          indent_char(ichar), indent_string(512, indent_char)
+    {}
 
     // delete because of pointer members
     serializer(const serializer&) = delete;
@@ -6331,175 +9971,6 @@ class serializer
     }
 
   private:
-    /*!
-    @brief returns the number of expected bytes following in UTF-8 string
-
-    @param[in]  u  the first byte of a UTF-8 string
-    @return  the number of expected bytes following
-    */
-    static constexpr std::size_t bytes_following(const uint8_t u)
-    {
-        return ((0 <= u and u <= 127) ? 0
-                : ((192 <= u and u <= 223) ? 1
-                   : ((224 <= u and u <= 239) ? 2
-                      : ((240 <= u and u <= 247) ? 3 : std::string::npos))));
-    }
-
-    /*!
-    @brief calculates the extra space to escape a JSON string
-
-    @param[in] s  the string to escape
-    @param[in] ensure_ascii  whether to escape non-ASCII characters with
-                             \uXXXX sequences
-    @return the number of characters required to escape string @a s
-
-    @complexity Linear in the length of string @a s.
-    */
-    static std::size_t extra_space(const string_t& s,
-                                   const bool ensure_ascii) noexcept
-    {
-        std::size_t res = 0;
-
-        for (std::size_t i = 0; i < s.size(); ++i)
-        {
-            switch (s[i])
-            {
-                // control characters that can be escaped with a backslash
-                case '"':
-                case '\\':
-                case '\b':
-                case '\f':
-                case '\n':
-                case '\r':
-                case '\t':
-                {
-                    // from c (1 byte) to \x (2 bytes)
-                    res += 1;
-                    break;
-                }
-
-                // control characters that need \uxxxx escaping
-                case 0x00:
-                case 0x01:
-                case 0x02:
-                case 0x03:
-                case 0x04:
-                case 0x05:
-                case 0x06:
-                case 0x07:
-                case 0x0b:
-                case 0x0e:
-                case 0x0f:
-                case 0x10:
-                case 0x11:
-                case 0x12:
-                case 0x13:
-                case 0x14:
-                case 0x15:
-                case 0x16:
-                case 0x17:
-                case 0x18:
-                case 0x19:
-                case 0x1a:
-                case 0x1b:
-                case 0x1c:
-                case 0x1d:
-                case 0x1e:
-                case 0x1f:
-                {
-                    // from c (1 byte) to \uxxxx (6 bytes)
-                    res += 5;
-                    break;
-                }
-
-                default:
-                {
-                    if (ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F))
-                    {
-                        const auto bytes = bytes_following(static_cast<uint8_t>(s[i]));
-                        if (bytes == std::string::npos)
-                        {
-                            // invalid characters are treated as is, so no
-                            // additional space will be used
-                            break;
-                        }
-
-                        if (bytes == 3)
-                        {
-                            // codepoints that need 4 bytes (i.e., 3 additional
-                            // bytes) in UTF-8 need a surrogate pair when \u
-                            // escaping is used: from 4 bytes to \uxxxx\uxxxx
-                            // (12 bytes)
-                            res += (12 - bytes - 1);
-                        }
-                        else
-                        {
-                            // from x bytes to \uxxxx (6 bytes)
-                            res += (6 - bytes - 1);
-                        }
-
-                        // skip the additional bytes
-                        i += bytes;
-                    }
-                    break;
-                }
-            }
-        }
-
-        return res;
-    }
-
-    static void escape_codepoint(int codepoint, string_t& result, std::size_t& pos)
-    {
-        // expecting a proper codepoint
-        assert(0x00 <= codepoint and codepoint <= 0x10FFFF);
-
-        // the last written character was the backslash before the 'u'
-        assert(result[pos] == '\\');
-
-        // write the 'u'
-        result[++pos] = 'u';
-
-        // convert a number 0..15 to its hex representation (0..f)
-        static const std::array<char, 16> hexify =
-        {
-            {
-                '0', '1', '2', '3', '4', '5', '6', '7',
-                '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
-            }
-        };
-
-        if (codepoint < 0x10000)
-        {
-            // codepoints U+0000..U+FFFF can be represented as \uxxxx.
-            result[++pos] = hexify[(codepoint >> 12) & 0x0F];
-            result[++pos] = hexify[(codepoint >> 8) & 0x0F];
-            result[++pos] = hexify[(codepoint >> 4) & 0x0F];
-            result[++pos] = hexify[codepoint & 0x0F];
-        }
-        else
-        {
-            // codepoints U+10000..U+10FFFF need a surrogate pair to be
-            // represented as \uxxxx\uxxxx.
-            // http://www.unicode.org/faq/utf_bom.html#utf16-4
-            codepoint -= 0x10000;
-            const int high_surrogate = 0xD800 | ((codepoint >> 10) & 0x3FF);
-            const int low_surrogate = 0xDC00 | (codepoint & 0x3FF);
-            result[++pos] = hexify[(high_surrogate >> 12) & 0x0F];
-            result[++pos] = hexify[(high_surrogate >> 8) & 0x0F];
-            result[++pos] = hexify[(high_surrogate >> 4) & 0x0F];
-            result[++pos] = hexify[high_surrogate & 0x0F];
-            ++pos;  // backslash is already in output
-            result[++pos] = 'u';
-            result[++pos] = hexify[(low_surrogate >> 12) & 0x0F];
-            result[++pos] = hexify[(low_surrogate >> 8) & 0x0F];
-            result[++pos] = hexify[(low_surrogate >> 4) & 0x0F];
-            result[++pos] = hexify[low_surrogate & 0x0F];
-        }
-
-        ++pos;
-    }
-
     /*!
     @brief dump escaped string
 
@@ -6514,146 +9985,146 @@ class serializer
 
     @complexity Linear in the length of string @a s.
     */
-    void dump_escaped(const string_t& s, const bool ensure_ascii) const
+    void dump_escaped(const string_t& s, const bool ensure_ascii)
     {
-        const auto space = extra_space(s, ensure_ascii);
-        if (space == 0)
-        {
-            o->write_characters(s.c_str(), s.size());
-            return;
-        }
-
-        // create a result string of necessary size
-        string_t result(s.size() + space, '\\');
-        std::size_t pos = 0;
+        uint32_t codepoint;
+        uint8_t state = UTF8_ACCEPT;
+        std::size_t bytes = 0;  // number of bytes written to string_buffer
 
         for (std::size_t i = 0; i < s.size(); ++i)
         {
-            switch (s[i])
+            const auto byte = static_cast<uint8_t>(s[i]);
+
+            switch (decode(state, codepoint, byte))
             {
-                case '"': // quotation mark (0x22)
+                case UTF8_ACCEPT:  // decode found a new code point
                 {
-                    result[pos + 1] = '"';
-                    pos += 2;
-                    break;
-                }
-
-                case '\\': // reverse solidus (0x5c)
-                {
-                    // nothing to change
-                    pos += 2;
-                    break;
-                }
-
-                case '\b': // backspace (0x08)
-                {
-                    result[pos + 1] = 'b';
-                    pos += 2;
-                    break;
-                }
-
-                case '\f': // formfeed (0x0c)
-                {
-                    result[pos + 1] = 'f';
-                    pos += 2;
-                    break;
-                }
-
-                case '\n': // newline (0x0a)
-                {
-                    result[pos + 1] = 'n';
-                    pos += 2;
-                    break;
-                }
-
-                case '\r': // carriage return (0x0d)
-                {
-                    result[pos + 1] = 'r';
-                    pos += 2;
-                    break;
-                }
-
-                case '\t': // horizontal tab (0x09)
-                {
-                    result[pos + 1] = 't';
-                    pos += 2;
-                    break;
-                }
-
-                default:
-                {
-                    // escape control characters (0x00..0x1F) or, if
-                    // ensure_ascii parameter is used, non-ASCII characters
-                    if ((0x00 <= s[i] and s[i] <= 0x1F) or
-                            (ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F)))
+                    switch (codepoint)
                     {
-                        const auto bytes = bytes_following(static_cast<uint8_t>(s[i]));
-                        if (bytes == std::string::npos)
+                        case 0x08: // backspace
                         {
-                            // copy invalid character as is
-                            result[pos++] = s[i];
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'b';
                             break;
                         }
 
-                        // check that the additional bytes are present
-                        assert(i + bytes < s.size());
-
-                        // to use \uxxxx escaping, we first need to caluclate
-                        // the codepoint from the UTF-8 bytes
-                        int codepoint = 0;
-
-                        assert(0 <= bytes and bytes <= 3);
-                        switch (bytes)
+                        case 0x09: // horizontal tab
                         {
-                            case 0:
-                            {
-                                codepoint = s[i] & 0xFF;
-                                break;
-                            }
-
-                            case 1:
-                            {
-                                codepoint = ((s[i] & 0x3F) << 6)
-                                            + (s[i + 1] & 0x7F);
-                                break;
-                            }
-
-                            case 2:
-                            {
-                                codepoint = ((s[i] & 0x1F) << 12)
-                                            + ((s[i + 1] & 0x7F) << 6)
-                                            + (s[i + 2] & 0x7F);
-                                break;
-                            }
-
-                            case 3:
-                            {
-                                codepoint = ((s[i] & 0xF) << 18)
-                                            + ((s[i + 1] & 0x7F) << 12)
-                                            + ((s[i + 2] & 0x7F) << 6)
-                                            + (s[i + 3] & 0x7F);
-                                break;
-                            }
-
-                            default:
-                                break;  // LCOV_EXCL_LINE
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 't';
+                            break;
                         }
 
-                        escape_codepoint(codepoint, result, pos);
-                        i += bytes;
+                        case 0x0A: // newline
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'n';
+                            break;
+                        }
+
+                        case 0x0C: // formfeed
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'f';
+                            break;
+                        }
+
+                        case 0x0D: // carriage return
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'r';
+                            break;
+                        }
+
+                        case 0x22: // quotation mark
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = '\"';
+                            break;
+                        }
+
+                        case 0x5C: // reverse solidus
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = '\\';
+                            break;
+                        }
+
+                        default:
+                        {
+                            // escape control characters (0x00..0x1F) or, if
+                            // ensure_ascii parameter is used, non-ASCII characters
+                            if ((codepoint <= 0x1F) or (ensure_ascii and (codepoint >= 0x7F)))
+                            {
+                                if (codepoint <= 0xFFFF)
+                                {
+                                    std::snprintf(string_buffer.data() + bytes, 7, "\\u%04x",
+                                                  static_cast<uint16_t>(codepoint));
+                                    bytes += 6;
+                                }
+                                else
+                                {
+                                    std::snprintf(string_buffer.data() + bytes, 13, "\\u%04x\\u%04x",
+                                                  static_cast<uint16_t>(0xD7C0 + (codepoint >> 10)),
+                                                  static_cast<uint16_t>(0xDC00 + (codepoint & 0x3FF)));
+                                    bytes += 12;
+                                }
+                            }
+                            else
+                            {
+                                // copy byte to buffer (all previous bytes have
+                                // already been copied by the outer default case)
+                                string_buffer[bytes++] = s[i];
+                            }
+                            break;
+                        }
                     }
-                    else
+
+                    // write buffer and reset index; there must be 13 bytes
+                    // left, as this is the maximal number of bytes to be
+                    // written ("\uxxxx\uxxxx\0") for one code point
+                    if (string_buffer.size() - bytes < 13)
                     {
-                        // all other characters are added as-is
-                        result[pos++] = s[i];
+                        o->write_characters(string_buffer.data(), bytes);
+                        bytes = 0;
+                    }
+                    break;
+                }
+
+                case UTF8_REJECT:  // decode found invalid UTF-8 byte
+                {
+                    std::string sn(3, '\0');
+                    snprintf(&sn[0], sn.size(), "%.2X", byte);
+                    JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
+                }
+
+                default:  // decode found yet incomplete multi-byte code point
+                {
+                    if (not ensure_ascii)
+                    {
+                        // code point will not be escaped - copy byte to buffer
+                        string_buffer[bytes++] = s[i];
                     }
                     break;
                 }
             }
         }
 
-        assert(pos == result.size());
-        o->write_characters(result.c_str(), result.size());
+        if (JSON_LIKELY(state == UTF8_ACCEPT))
+        {
+            // write buffer
+            if (bytes > 0)
+            {
+                o->write_characters(string_buffer.data(), bytes);
+            }
+        }
+        else
+        {
+            // we finish reading, but do not accept: string was incomplete
+            std::string sn(3, '\0');
+            snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
+            JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
+        }
     }
 
     /*!
@@ -6665,11 +10136,10 @@ class serializer
     @param[in] x  integer number (signed or unsigned) to dump
     @tparam NumberType either @a number_integer_t or @a number_unsigned_t
     */
-    template <
-        typename NumberType,
-        detail::enable_if_t<std::is_same<NumberType, number_unsigned_t>::value or
-                            std::is_same<NumberType, number_integer_t>::value,
-                            int> = 0 >
+    template<typename NumberType, detail::enable_if_t<
+                 std::is_same<NumberType, number_unsigned_t>::value or
+                 std::is_same<NumberType, number_integer_t>::value,
+                 int> = 0>
     void dump_integer(NumberType x)
     {
         // special case for "0"
@@ -6679,20 +10149,19 @@ class serializer
             return;
         }
 
-        const bool is_negative = x < 0;
+        const bool is_negative = not (x >= 0);  // see issue #755
         std::size_t i = 0;
 
-        // spare 1 byte for '\0'
-        while (x != 0 and i < number_buffer.size() - 1)
+        while (x != 0)
         {
+            // spare 1 byte for '\0'
+            assert(i < number_buffer.size() - 1);
+
             const auto digit = std::labs(static_cast<long>(x % 10));
             number_buffer[i++] = static_cast<char>('0' + digit);
             x /= 10;
         }
 
-        // make sure the number has been processed completely
-        assert(x == 0);
-
         if (is_negative)
         {
             // make sure there is capacity for the '-'
@@ -6715,28 +10184,36 @@ class serializer
     void dump_float(number_float_t x)
     {
         // NaN / inf
-        if (not std::isfinite(x) or std::isnan(x))
+        if (not std::isfinite(x))
         {
             o->write_characters("null", 4);
             return;
         }
 
-        // special case for 0.0 and -0.0
-        if (x == 0)
-        {
-            if (std::signbit(x))
-            {
-                o->write_characters("-0.0", 4);
-            }
-            else
-            {
-                o->write_characters("0.0", 3);
-            }
-            return;
-        }
+        // If number_float_t is an IEEE-754 single or double precision number,
+        // use the Grisu2 algorithm to produce short numbers which are
+        // guaranteed to round-trip, using strtof and strtod, resp.
+        //
+        // NB: The test below works if <long double> == <double>.
+        static constexpr bool is_ieee_single_or_double
+            = (std::numeric_limits<number_float_t>::is_iec559 and std::numeric_limits<number_float_t>::digits == 24 and std::numeric_limits<number_float_t>::max_exponent == 128) or
+              (std::numeric_limits<number_float_t>::is_iec559 and std::numeric_limits<number_float_t>::digits == 53 and std::numeric_limits<number_float_t>::max_exponent == 1024);
 
-        // get number of digits for a text -> float -> text round-trip
-        static constexpr auto d = std::numeric_limits<number_float_t>::digits10;
+        dump_float(x, std::integral_constant<bool, is_ieee_single_or_double>());
+    }
+
+    void dump_float(number_float_t x, std::true_type /*is_ieee_single_or_double*/)
+    {
+        char* begin = number_buffer.data();
+        char* end = ::nlohmann::detail::to_chars(begin, begin + number_buffer.size(), x);
+
+        o->write_characters(begin, static_cast<size_t>(end - begin));
+    }
+
+    void dump_float(number_float_t x, std::false_type /*is_ieee_single_or_double*/)
+    {
+        // get number of digits for a float -> text -> float round-trip
+        static constexpr auto d = std::numeric_limits<number_float_t>::max_digits10;
 
         // the actual conversion
         std::ptrdiff_t len = snprintf(number_buffer.data(), number_buffer.size(), "%.*g", d, x);
@@ -6782,6 +10259,59 @@ class serializer
         }
     }
 
+    /*!
+    @brief check whether a string is UTF-8 encoded
+
+    The function checks each byte of a string whether it is UTF-8 encoded. The
+    result of the check is stored in the @a state parameter. The function must
+    be called initially with state 0 (accept). State 1 means the string must
+    be rejected, because the current byte is not allowed. If the string is
+    completely processed, but the state is non-zero, the string ended
+    prematurely; that is, the last byte indicated more bytes should have
+    followed.
+
+    @param[in,out] state  the state of the decoding
+    @param[in,out] codep  codepoint (valid only if resulting state is UTF8_ACCEPT)
+    @param[in] byte       next byte to decode
+    @return               new state
+
+    @note The function has been edited: a std::array is used.
+
+    @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+    @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+    */
+    static uint8_t decode(uint8_t& state, uint32_t& codep, const uint8_t byte) noexcept
+    {
+        static const std::array<uint8_t, 400> utf8d =
+        {
+            {
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF
+                8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF
+                0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF
+                0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF
+                0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
+                1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
+                1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
+                1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8
+            }
+        };
+
+        const uint8_t type = utf8d[byte];
+
+        codep = (state != UTF8_ACCEPT)
+                ? (byte & 0x3fu) | (codep << 6)
+                : static_cast<uint32_t>(0xff >> type) & (byte);
+
+        state = utf8d[256u + state * 16u + type];
+        return state;
+    }
+
   private:
     /// the output of the serializer
     output_adapter_t<char> o = nullptr;
@@ -6796,13 +10326,27 @@ class serializer
     /// the locale's decimal point character
     const char decimal_point = '\0';
 
+    /// string buffer
+    std::array<char, 512> string_buffer{{}};
+
     /// the indentation character
     const char indent_char;
-
     /// the indentation string
     string_t indent_string;
 };
+}
+}
 
+// #include <nlohmann/detail/json_ref.hpp>
+
+
+#include <initializer_list>
+#include <utility>
+
+namespace nlohmann
+{
+namespace detail
+{
 template<typename BasicJsonType>
 class json_ref
 {
@@ -6810,27 +10354,20 @@ class json_ref
     using value_type = BasicJsonType;
 
     json_ref(value_type&& value)
-        : owned_value(std::move(value)),
-          value_ref(&owned_value),
-          is_rvalue(true)
+        : owned_value(std::move(value)), value_ref(&owned_value), is_rvalue(true)
     {}
 
     json_ref(const value_type& value)
-        : value_ref(const_cast<value_type*>(&value)),
-          is_rvalue(false)
+        : value_ref(const_cast<value_type*>(&value)), is_rvalue(false)
     {}
 
     json_ref(std::initializer_list<json_ref> init)
-        : owned_value(init),
-          value_ref(&owned_value),
-          is_rvalue(true)
+        : owned_value(init), value_ref(&owned_value), is_rvalue(true)
     {}
 
-    template <class... Args>
-    json_ref(Args... args)
-        : owned_value(std::forward<Args>(args)...),
-          value_ref(&owned_value),
-          is_rvalue(true)
+    template<class... Args>
+    json_ref(Args&& ... args)
+        : owned_value(std::forward<Args>(args)...), value_ref(&owned_value), is_rvalue(true)
     {}
 
     // class should be movable only
@@ -6862,74 +10399,30 @@ class json_ref
     value_type* value_ref = nullptr;
     const bool is_rvalue;
 };
-
-} // namespace detail
-
-/// namespace to hold default `to_json` / `from_json` functions
-namespace
-{
-constexpr const auto& to_json = detail::static_const<detail::to_json_fn>::value;
-constexpr const auto& from_json = detail::static_const<detail::from_json_fn>::value;
+}
 }
 
+// #include <nlohmann/detail/json_pointer.hpp>
 
-/*!
-@brief default JSONSerializer template argument
 
-This serializer ignores the template arguments and uses ADL
-([argument-dependent lookup](http://en.cppreference.com/w/cpp/language/adl))
-for serialization.
-*/
-template<typename, typename>
-struct adl_serializer
+#include <cassert> // assert
+#include <numeric> // accumulate
+#include <string> // string
+#include <vector> // vector
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
 {
-    /*!
-    @brief convert a JSON value to any value type
-
-    This function is usually called by the `get()` function of the
-    @ref basic_json class (either explicit or via conversion operators).
-
-    @param[in] j         JSON value to read from
-    @param[in,out] val  value to write to
-    */
-    template<typename BasicJsonType, typename ValueType>
-    static void from_json(BasicJsonType&& j, ValueType& val) noexcept(
-        noexcept(::nlohmann::from_json(std::forward<BasicJsonType>(j), val)))
-    {
-        ::nlohmann::from_json(std::forward<BasicJsonType>(j), val);
-    }
-
-    /*!
-    @brief convert any value type to a JSON value
-
-    This function is usually called by the constructors of the @ref basic_json
-    class.
-
-    @param[in,out] j  JSON value to write to
-    @param[in] val     value to read from
-    */
-    template<typename BasicJsonType, typename ValueType>
-    static void to_json(BasicJsonType& j, ValueType&& val) noexcept(
-        noexcept(::nlohmann::to_json(j, std::forward<ValueType>(val))))
-    {
-        ::nlohmann::to_json(j, std::forward<ValueType>(val));
-    }
-};
-
-/*!
-@brief JSON Pointer
-
-A JSON pointer defines a string syntax for identifying a specific value
-within a JSON document. It can be used with functions `at` and
-`operator[]`. Furthermore, JSON pointers are the base for JSON patches.
-
-@sa [RFC 6901](https://tools.ietf.org/html/rfc6901)
-
-@since version 2.0.0
-*/
+template<typename BasicJsonType>
 class json_pointer
 {
-    /// allow basic_json to access private members
+    // allow basic_json to access private members
     NLOHMANN_BASIC_JSON_TPL_DECLARATION
     friend class basic_json;
 
@@ -6943,19 +10436,21 @@ class json_pointer
     @param[in] s  string representing the JSON pointer; if omitted, the empty
                   string is assumed which references the whole JSON value
 
-    @throw parse_error.107 if the given JSON pointer @a s is nonempty and
-    does not begin with a slash (`/`); see example below
+    @throw parse_error.107 if the given JSON pointer @a s is nonempty and does
+                           not begin with a slash (`/`); see example below
 
-    @throw parse_error.108 if a tilde (`~`) in the given JSON pointer @a s
-    is not followed by `0` (representing `~`) or `1` (representing `/`);
-    see example below
+    @throw parse_error.108 if a tilde (`~`) in the given JSON pointer @a s is
+    not followed by `0` (representing `~`) or `1` (representing `/`); see
+    example below
 
-    @liveexample{The example shows the construction several valid JSON
-    pointers as well as the exceptional behavior.,json_pointer}
+    @liveexample{The example shows the construction of several valid JSON pointers
+    as well as the exceptional behavior.,json_pointer}
 
     @since version 2.0.0
     */
-    explicit json_pointer(const std::string& s = "") : reference_tokens(split(s)) {}
+    explicit json_pointer(const std::string& s = "")
+        : reference_tokens(split(s))
+    {}
 
     /*!
     @brief return a string representation of the JSON pointer
@@ -6988,6 +10483,27 @@ class json_pointer
         return to_string();
     }
 
+    /*!
+    @param[in] s  reference token to be converted into an array index
+
+    @return integer representation of @a s
+
+    @throw out_of_range.404 if string @a s could not be converted to an integer
+    */
+    static int array_index(const std::string& s)
+    {
+        std::size_t processed_chars = 0;
+        const int res = std::stoi(s, &processed_chars);
+
+        // check if the string was completely read
+        if (JSON_UNLIKELY(processed_chars != s.size()))
+        {
+            JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'"));
+        }
+
+        return res;
+    }
+
   private:
     /*!
     @brief remove and return last reference pointer
@@ -7023,7 +10539,6 @@ class json_pointer
         return result;
     }
 
-
     /*!
     @brief create and return a reference to the pointed to value
 
@@ -7032,8 +10547,66 @@ class json_pointer
     @throw parse_error.109 if array index is not a number
     @throw type_error.313 if value cannot be unflattened
     */
-    NLOHMANN_BASIC_JSON_TPL_DECLARATION
-    NLOHMANN_BASIC_JSON_TPL& get_and_create(NLOHMANN_BASIC_JSON_TPL& j) const;
+    BasicJsonType& get_and_create(BasicJsonType& j) const
+    {
+        using size_type = typename BasicJsonType::size_type;
+        auto result = &j;
+
+        // in case no reference tokens exist, return a reference to the JSON value
+        // j which will be overwritten by a primitive value
+        for (const auto& reference_token : reference_tokens)
+        {
+            switch (result->m_type)
+            {
+                case detail::value_t::null:
+                {
+                    if (reference_token == "0")
+                    {
+                        // start a new array if reference token is 0
+                        result = &result->operator[](0);
+                    }
+                    else
+                    {
+                        // start a new object otherwise
+                        result = &result->operator[](reference_token);
+                    }
+                    break;
+                }
+
+                case detail::value_t::object:
+                {
+                    // create an entry in the object
+                    result = &result->operator[](reference_token);
+                    break;
+                }
+
+                case detail::value_t::array:
+                {
+                    // create an entry in the array
+                    JSON_TRY
+                    {
+                        result = &result->operator[](static_cast<size_type>(array_index(reference_token)));
+                    }
+                    JSON_CATCH(std::invalid_argument&)
+                    {
+                        JSON_THROW(detail::parse_error::create(109, 0, "array index '" + reference_token + "' is not a number"));
+                    }
+                    break;
+                }
+
+                /*
+                The following code is only reached if there exists a reference
+                token _and_ the current value is primitive. In this case, we have
+                an error situation, because primitive values may only occur as
+                single value; that is, with an empty list of reference tokens.
+                */
+                default:
+                    JSON_THROW(detail::type_error::create(313, "invalid value to unflatten"));
+            }
+        }
+
+        return *result;
+    }
 
     /*!
     @brief return a reference to the pointed to value
@@ -7054,8 +10627,75 @@ class json_pointer
     @throw parse_error.109   if an array index was not a number
     @throw out_of_range.404  if the JSON pointer can not be resolved
     */
-    NLOHMANN_BASIC_JSON_TPL_DECLARATION
-    NLOHMANN_BASIC_JSON_TPL& get_unchecked(NLOHMANN_BASIC_JSON_TPL* ptr) const;
+    BasicJsonType& get_unchecked(BasicJsonType* ptr) const
+    {
+        using size_type = typename BasicJsonType::size_type;
+        for (const auto& reference_token : reference_tokens)
+        {
+            // convert null values to arrays or objects before continuing
+            if (ptr->m_type == detail::value_t::null)
+            {
+                // check if reference token is a number
+                const bool nums =
+                    std::all_of(reference_token.begin(), reference_token.end(),
+                                [](const char x)
+                {
+                    return (x >= '0' and x <= '9');
+                });
+
+                // change value to array for numbers or "-" or to object otherwise
+                *ptr = (nums or reference_token == "-")
+                       ? detail::value_t::array
+                       : detail::value_t::object;
+            }
+
+            switch (ptr->m_type)
+            {
+                case detail::value_t::object:
+                {
+                    // use unchecked object access
+                    ptr = &ptr->operator[](reference_token);
+                    break;
+                }
+
+                case detail::value_t::array:
+                {
+                    // error condition (cf. RFC 6901, Sect. 4)
+                    if (JSON_UNLIKELY(reference_token.size() > 1 and reference_token[0] == '0'))
+                    {
+                        JSON_THROW(detail::parse_error::create(106, 0,
+                                                               "array index '" + reference_token +
+                                                               "' must not begin with '0'"));
+                    }
+
+                    if (reference_token == "-")
+                    {
+                        // explicitly treat "-" as index beyond the end
+                        ptr = &ptr->operator[](ptr->m_value.array->size());
+                    }
+                    else
+                    {
+                        // convert array index to number; unchecked access
+                        JSON_TRY
+                        {
+                            ptr = &ptr->operator[](
+                                static_cast<size_type>(array_index(reference_token)));
+                        }
+                        JSON_CATCH(std::invalid_argument&)
+                        {
+                            JSON_THROW(detail::parse_error::create(109, 0, "array index '" + reference_token + "' is not a number"));
+                        }
+                    }
+                    break;
+                }
+
+                default:
+                    JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'"));
+            }
+        }
+
+        return *ptr;
+    }
 
     /*!
     @throw parse_error.106   if an array index begins with '0'
@@ -7063,8 +10703,57 @@ class json_pointer
     @throw out_of_range.402  if the array index '-' is used
     @throw out_of_range.404  if the JSON pointer can not be resolved
     */
-    NLOHMANN_BASIC_JSON_TPL_DECLARATION
-    NLOHMANN_BASIC_JSON_TPL& get_checked(NLOHMANN_BASIC_JSON_TPL* ptr) const;
+    BasicJsonType& get_checked(BasicJsonType* ptr) const
+    {
+        using size_type = typename BasicJsonType::size_type;
+        for (const auto& reference_token : reference_tokens)
+        {
+            switch (ptr->m_type)
+            {
+                case detail::value_t::object:
+                {
+                    // note: at performs range check
+                    ptr = &ptr->at(reference_token);
+                    break;
+                }
+
+                case detail::value_t::array:
+                {
+                    if (JSON_UNLIKELY(reference_token == "-"))
+                    {
+                        // "-" always fails the range check
+                        JSON_THROW(detail::out_of_range::create(402,
+                                                                "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
+                                                                ") is out of range"));
+                    }
+
+                    // error condition (cf. RFC 6901, Sect. 4)
+                    if (JSON_UNLIKELY(reference_token.size() > 1 and reference_token[0] == '0'))
+                    {
+                        JSON_THROW(detail::parse_error::create(106, 0,
+                                                               "array index '" + reference_token +
+                                                               "' must not begin with '0'"));
+                    }
+
+                    // note: at performs range check
+                    JSON_TRY
+                    {
+                        ptr = &ptr->at(static_cast<size_type>(array_index(reference_token)));
+                    }
+                    JSON_CATCH(std::invalid_argument&)
+                    {
+                        JSON_THROW(detail::parse_error::create(109, 0, "array index '" + reference_token + "' is not a number"));
+                    }
+                    break;
+                }
+
+                default:
+                    JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'"));
+            }
+        }
+
+        return *ptr;
+    }
 
     /*!
     @brief return a const reference to the pointed to value
@@ -7079,8 +10768,58 @@ class json_pointer
     @throw out_of_range.402  if the array index '-' is used
     @throw out_of_range.404  if the JSON pointer can not be resolved
     */
-    NLOHMANN_BASIC_JSON_TPL_DECLARATION
-    const NLOHMANN_BASIC_JSON_TPL& get_unchecked(const NLOHMANN_BASIC_JSON_TPL* ptr) const;
+    const BasicJsonType& get_unchecked(const BasicJsonType* ptr) const
+    {
+        using size_type = typename BasicJsonType::size_type;
+        // resolve one reference token per iteration, replacing ptr with the
+        // child that the token selects
+        for (const auto& token : reference_tokens)
+        {
+            // objects: descend by key using unchecked object access
+            if (ptr->m_type == detail::value_t::object)
+            {
+                ptr = &ptr->operator[](token);
+                continue;
+            }
+
+            // neither object nor array: the token cannot be resolved
+            if (ptr->m_type != detail::value_t::array)
+            {
+                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'"));
+            }
+
+            // from here on ptr refers to an array;
+            // "-" cannot be used for const access
+            if (JSON_UNLIKELY(token == "-"))
+            {
+                const auto array_size = std::to_string(ptr->m_value.array->size());
+                JSON_THROW(detail::out_of_range::create(402,
+                                                        "array index '-' (" + array_size + ") is out of range"));
+            }
+
+            // a multi-character index must not start with '0'
+            // (error condition, cf. RFC 6901, Sect. 4)
+            const bool has_leading_zero = token.size() > 1 and token[0] == '0';
+            if (JSON_UNLIKELY(has_leading_zero))
+            {
+                JSON_THROW(detail::parse_error::create(106, 0,
+                                                       "array index '" + token + "' must not begin with '0'"));
+            }
+
+            // convert the token to an index and use unchecked array access
+            JSON_TRY
+            {
+                const auto index = static_cast<size_type>(array_index(token));
+                ptr = &ptr->operator[](index);
+            }
+            JSON_CATCH(std::invalid_argument&)
+            {
+                JSON_THROW(detail::parse_error::create(109, 0, "array index '" + token + "' is not a number"));
+            }
+        }
+
+        return *ptr;
+    }
 
     /*!
     @throw parse_error.106   if an array index begins with '0'
@@ -7088,8 +10827,57 @@ class json_pointer
     @throw out_of_range.402  if the array index '-' is used
     @throw out_of_range.404  if the JSON pointer can not be resolved
     */
-    NLOHMANN_BASIC_JSON_TPL_DECLARATION
-    const NLOHMANN_BASIC_JSON_TPL& get_checked(const NLOHMANN_BASIC_JSON_TPL* ptr) const;
+    const BasicJsonType& get_checked(const BasicJsonType* ptr) const
+    {
+        using size_type = typename BasicJsonType::size_type;
+        // resolve one reference token per iteration, replacing ptr with the
+        // child that the token selects
+        for (const auto& token : reference_tokens)
+        {
+            // objects: descend by key (note: at performs range check)
+            if (ptr->m_type == detail::value_t::object)
+            {
+                ptr = &ptr->at(token);
+                continue;
+            }
+
+            // neither object nor array: the token cannot be resolved
+            if (ptr->m_type != detail::value_t::array)
+            {
+                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'"));
+            }
+
+            // from here on ptr refers to an array; "-" always fails the range check
+            if (JSON_UNLIKELY(token == "-"))
+            {
+                const auto array_size = std::to_string(ptr->m_value.array->size());
+                JSON_THROW(detail::out_of_range::create(402,
+                                                        "array index '-' (" + array_size + ") is out of range"));
+            }
+
+            // a multi-character index must not start with '0'
+            // (error condition, cf. RFC 6901, Sect. 4)
+            const bool has_leading_zero = token.size() > 1 and token[0] == '0';
+            if (JSON_UNLIKELY(has_leading_zero))
+            {
+                JSON_THROW(detail::parse_error::create(106, 0,
+                                                       "array index '" + token + "' must not begin with '0'"));
+            }
+
+            // convert the token to an index (note: at performs range check)
+            JSON_TRY
+            {
+                const auto index = static_cast<size_type>(array_index(token));
+                ptr = &ptr->at(index);
+            }
+            JSON_CATCH(std::invalid_argument&)
+            {
+                JSON_THROW(detail::parse_error::create(109, 0, "array index '" + token + "' is not a number"));
+            }
+        }
+
+        return *ptr;
+    }
 
     /*!
     @brief split the string input to reference tokens
@@ -7208,10 +10996,57 @@ class json_pointer
 
     @note Empty objects or arrays are flattened to `null`.
     */
-    NLOHMANN_BASIC_JSON_TPL_DECLARATION
     static void flatten(const std::string& reference_string,
-                        const NLOHMANN_BASIC_JSON_TPL& value,
-                        NLOHMANN_BASIC_JSON_TPL& result);
+                        const BasicJsonType& value,
+                        BasicJsonType& result)
+    {
+        switch (value.m_type)  // containers recurse; every other type is stored directly
+        {
+            case detail::value_t::array:
+            {
+                if (value.m_value.array->empty())
+                {
+                    // an empty array has no elements to recurse into: store null under its own reference string
+                    result[reference_string] = nullptr;
+                }
+                else
+                {
+                    // recurse into every element, extending the reference string by "/" and the element index
+                    for (std::size_t i = 0; i < value.m_value.array->size(); ++i)
+                    {
+                        flatten(reference_string + "/" + std::to_string(i),
+                                value.m_value.array->operator[](i), result);
+                    }
+                }
+                break;
+            }
+
+            case detail::value_t::object:
+            {
+                if (value.m_value.object->empty())
+                {
+                    // an empty object has no members to recurse into: store null under its own reference string
+                    result[reference_string] = nullptr;
+                }
+                else
+                {
+                    // recurse into every member, extending the reference string by "/" and the key passed through escape()
+                    for (const auto& element : *value.m_value.object)
+                    {
+                        flatten(reference_string + "/" + escape(element.first), element.second, result);
+                    }
+                }
+                break;
+            }
+
+            default:
+            {
+                // any non-container value is copied into result under its reference string
+                result[reference_string] = value;
+                break;
+            }
+        }
+    }
 
     /*!
     @param[in] value  flattened JSON
@@ -7223,19 +11058,112 @@ class json_pointer
     @throw type_error.315  if object values are not primitive
     @throw type_error.313  if value cannot be unflattened
     */
-    NLOHMANN_BASIC_JSON_TPL_DECLARATION
-    static NLOHMANN_BASIC_JSON_TPL
-    unflatten(const NLOHMANN_BASIC_JSON_TPL& value);
+    static BasicJsonType
+    unflatten(const BasicJsonType& value)
+    {
+        // only an object (reference string -> value) can be unflattened
+        if (JSON_UNLIKELY(not value.is_object()))
+        {
+            JSON_THROW(detail::type_error::create(314, "only objects can be unflattened"));
+        }
+
+        BasicJsonType unflattened;
+        for (const auto& entry : *value.m_value.object)
+        {
+            const auto& pointer_string = entry.first;
+            const auto& leaf = entry.second;
+            if (JSON_UNLIKELY(not leaf.is_primitive()))
+            {
+                JSON_THROW(detail::type_error::create(315, "values in object must be primitive"));
+            }
+
+            // assign the leaf to the location the JSON pointer refers to; for
+            // the pointer "" (the whole value) get_and_create returns a
+            // reference to the result itself, so it becomes a primitive value
+            json_pointer(pointer_string).get_and_create(unflattened) = leaf;
+        }
+
+        return unflattened;
+    }
 
     friend bool operator==(json_pointer const& lhs,
-                           json_pointer const& rhs) noexcept;
+                           json_pointer const& rhs) noexcept
+    {
+        return (lhs.reference_tokens == rhs.reference_tokens);  // equal iff the token sequences compare equal
+    }
 
     friend bool operator!=(json_pointer const& lhs,
-                           json_pointer const& rhs) noexcept;
+                           json_pointer const& rhs) noexcept
+    {
+        return not (lhs == rhs);  // defined as the negation of operator==
+    }
 
     /// the reference tokens
     std::vector<std::string> reference_tokens;
 };
+}
+
+// #include <nlohmann/adl_serializer.hpp>
+
+
+#include <utility>
+
+// #include <nlohmann/detail/conversions/from_json.hpp>
+
+// #include <nlohmann/detail/conversions/to_json.hpp>
+
+
+namespace nlohmann
+{
+template<typename, typename>  // both parameters are unnamed and not used inside the struct
+struct adl_serializer
+{
+    /*!
+    @brief convert a JSON value to any value type
+
+    Forwards to `::nlohmann::from_json`. Usually called by the `get()` function
+    of the @ref basic_json class (explicitly or via conversion operators).
+
+    @param[in] j        JSON value to read from
+    @param[in,out] val  value to write to
+    */
+    template<typename BasicJsonType, typename ValueType>
+    static auto from_json(BasicJsonType&& j, ValueType& val) noexcept(  // noexcept exactly when the forwarded call is
+        noexcept(::nlohmann::from_json(std::forward<BasicJsonType>(j), val))) -> decltype(
+            ::nlohmann::from_json(std::forward<BasicJsonType>(j), val), void()
+        )  // returns void; the overload is only viable if the forwarded call is well-formed
+    {
+        ::nlohmann::from_json(std::forward<BasicJsonType>(j), val);
+    }
+
+    /*!
+    @brief convert any value type to a JSON value
+
+    Forwards to `::nlohmann::to_json`. Usually called by the constructors of
+    the @ref basic_json class.
+
+    @param[in,out] j  JSON value to write to
+    @param[in] val    value to read from
+    */
+    template <typename BasicJsonType, typename ValueType>
+    static auto to_json(BasicJsonType& j, ValueType&& val) noexcept(  // noexcept exactly when the forwarded call is
+        noexcept(::nlohmann::to_json(j, std::forward<ValueType>(val))))
+    -> decltype(::nlohmann::to_json(j, std::forward<ValueType>(val)),
+                void())  // returns void; the overload is only viable if the forwarded call is well-formed
+    {
+        ::nlohmann::to_json(j, std::forward<ValueType>(val));
+    }
+};
+}
+
+
+/*!
+@brief namespace for Niels Lohmann
+@see https://github.com/nlohmann
+@since version 1.0.0
+*/
+namespace nlohmann
+{
 
 /*!
 @brief a class to store JSON values
@@ -7261,42 +11189,42 @@ and `from_json()` (@ref adl_serializer by default)
 
 @requirement The class satisfies the following concept requirements:
 - Basic
- - [DefaultConstructible](http://en.cppreference.com/w/cpp/concept/DefaultConstructible):
+ - [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible):
    JSON values can be default constructed. The result will be a JSON null
    value.
- - [MoveConstructible](http://en.cppreference.com/w/cpp/concept/MoveConstructible):
+ - [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible):
    A JSON value can be constructed from an rvalue argument.
- - [CopyConstructible](http://en.cppreference.com/w/cpp/concept/CopyConstructible):
+ - [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible):
    A JSON value can be copy-constructed from an lvalue expression.
- - [MoveAssignable](http://en.cppreference.com/w/cpp/concept/MoveAssignable):
+ - [MoveAssignable](https://en.cppreference.com/w/cpp/named_req/MoveAssignable):
    A JSON value van be assigned from an rvalue argument.
- - [CopyAssignable](http://en.cppreference.com/w/cpp/concept/CopyAssignable):
+ - [CopyAssignable](https://en.cppreference.com/w/cpp/named_req/CopyAssignable):
    A JSON value can be copy-assigned from an lvalue expression.
- - [Destructible](http://en.cppreference.com/w/cpp/concept/Destructible):
+ - [Destructible](https://en.cppreference.com/w/cpp/named_req/Destructible):
    JSON values can be destructed.
 - Layout
- - [StandardLayoutType](http://en.cppreference.com/w/cpp/concept/StandardLayoutType):
+ - [StandardLayoutType](https://en.cppreference.com/w/cpp/named_req/StandardLayoutType):
    JSON values have
-   [standard layout](http://en.cppreference.com/w/cpp/language/data_members#Standard_layout):
+   [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard_layout):
    All non-static data members are private and standard layout types, the
    class has no virtual functions or (virtual) base classes.
 - Library-wide
- - [EqualityComparable](http://en.cppreference.com/w/cpp/concept/EqualityComparable):
+ - [EqualityComparable](https://en.cppreference.com/w/cpp/named_req/EqualityComparable):
    JSON values can be compared with `==`, see @ref
    operator==(const_reference,const_reference).
- - [LessThanComparable](http://en.cppreference.com/w/cpp/concept/LessThanComparable):
+ - [LessThanComparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable):
    JSON values can be compared with `<`, see @ref
    operator<(const_reference,const_reference).
- - [Swappable](http://en.cppreference.com/w/cpp/concept/Swappable):
+ - [Swappable](https://en.cppreference.com/w/cpp/named_req/Swappable):
    Any JSON lvalue or rvalue of can be swapped with any lvalue or rvalue of
    other compatible types, using unqualified function call @ref swap().
- - [NullablePointer](http://en.cppreference.com/w/cpp/concept/NullablePointer):
+ - [NullablePointer](https://en.cppreference.com/w/cpp/named_req/NullablePointer):
    JSON values can be compared against `std::nullptr_t` objects which are used
    to model the `null` value.
 - Container
- - [Container](http://en.cppreference.com/w/cpp/concept/Container):
+ - [Container](https://en.cppreference.com/w/cpp/named_req/Container):
    JSON values can be used like STL containers and provide iterator access.
- - [ReversibleContainer](http://en.cppreference.com/w/cpp/concept/ReversibleContainer);
+ - [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer);
    JSON values can be used like STL containers and provide reverse iterator
    access.
 
@@ -7323,15 +11251,19 @@ class basic_json
 {
   private:
     template<detail::value_t> friend struct detail::external_constructor;
-    friend ::nlohmann::json_pointer;
+    friend ::nlohmann::json_pointer<basic_json>;
     friend ::nlohmann::detail::parser<basic_json>;
     friend ::nlohmann::detail::serializer<basic_json>;
     template<typename BasicJsonType>
     friend class ::nlohmann::detail::iter_impl;
     template<typename BasicJsonType, typename CharType>
     friend class ::nlohmann::detail::binary_writer;
-    template<typename BasicJsonType>
+    template<typename BasicJsonType, typename SAX>
     friend class ::nlohmann::detail::binary_reader;
+    template<typename BasicJsonType>
+    friend class ::nlohmann::detail::json_sax_dom_parser;
+    template<typename BasicJsonType>
+    friend class ::nlohmann::detail::json_sax_dom_callback_parser;
 
     /// workaround type for MSVC
     using basic_json_t = NLOHMANN_BASIC_JSON_TPL;
@@ -7359,13 +11291,17 @@ class basic_json
 
   public:
     using value_t = detail::value_t;
-    // forward declarations
-    using json_pointer = ::nlohmann::json_pointer;
+    /// JSON Pointer, see @ref nlohmann::json_pointer
+    using json_pointer = ::nlohmann::json_pointer<basic_json>;
     template<typename T, typename SFINAE>
     using json_serializer = JSONSerializer<T, SFINAE>;
-
+    /// helper type for initializer lists of basic_json values
     using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;
 
+    using input_format_t = detail::input_format_t;
+    /// SAX interface type, see @ref nlohmann::json_sax
+    using json_sax_t = json_sax<basic_json>;
+
     ////////////////
     // exceptions //
     ////////////////
@@ -7473,10 +11409,13 @@ class basic_json
         result["copyright"] = "(C) 2013-2017 Niels Lohmann";
         result["name"] = "JSON for Modern C++";
         result["url"] = "https://github.com/nlohmann/json";
-        result["version"] =
-        {
-            {"string", "2.1.1"}, {"major", 2}, {"minor", 1}, {"patch", 1}
-        };
+        result["version"]["string"] =
+            std::to_string(NLOHMANN_JSON_VERSION_MAJOR) + "." +
+            std::to_string(NLOHMANN_JSON_VERSION_MINOR) + "." +
+            std::to_string(NLOHMANN_JSON_VERSION_PATCH);
+        result["version"]["major"] = NLOHMANN_JSON_VERSION_MAJOR;
+        result["version"]["minor"] = NLOHMANN_JSON_VERSION_MINOR;
+        result["version"]["patch"] = NLOHMANN_JSON_VERSION_PATCH;
 
 #ifdef _WIN32
         result["platform"] = "win32";
@@ -7528,6 +11467,14 @@ class basic_json
     /// the template arguments passed to class @ref basic_json.
     /// @{
 
+#if defined(JSON_HAS_CPP_14)
+    // Use transparent comparator if possible, combined with perfect forwarding
+    // on find() and count() calls prevents unnecessary string construction.
+    using object_comparator_t = std::less<>;
+#else
+    using object_comparator_t = std::less<StringType>;
+#endif
+
     /*!
     @brief a type for an object
 
@@ -7570,10 +11517,10 @@ class basic_json
     - When all names are unique, objects will be interoperable in the sense
       that all software implementations receiving that object will agree on
       the name-value mappings.
-    - When the names within an object are not unique, later stored name/value
-      pairs overwrite previously stored name/value pairs, leaving the used
-      names unique. For instance, `{"key": 1}` and `{"key": 2, "key": 1}` will
-      be treated as equal and both stored as `{"key": 1}`.
+    - When the names within an object are not unique, it is unspecified which
+      one of the values for a given key will be chosen. For instance,
+      `{"key": 2, "key": 1}` could be equal to either `{"key": 1}` or
+      `{"key": 2}`.
     - Internally, name/value pairs are stored in lexicographical order of the
       names. Objects will also be serialized (see @ref dump) in this order.
       For instance, `{"b": 1, "a": 2}` and `{"a": 2, "b": 1}` will be stored
@@ -7613,7 +11560,7 @@ class basic_json
     */
     using object_t = ObjectType<StringType,
           basic_json,
-          std::less<StringType>,
+          object_comparator_t,
           AllocatorType<std::pair<const StringType,
           basic_json>>>;
 
@@ -7962,12 +11909,14 @@ class basic_json
     static T* create(Args&& ... args)
     {
         AllocatorType<T> alloc;
+        using AllocatorTraits = std::allocator_traits<AllocatorType<T>>;
+
         auto deleter = [&](T * object)
         {
-            alloc.deallocate(object, 1);
+            AllocatorTraits::deallocate(alloc, object, 1);
         };
-        std::unique_ptr<T, decltype(deleter)> object(alloc.allocate(1), deleter);
-        alloc.construct(object.get(), std::forward<Args>(args)...);
+        std::unique_ptr<T, decltype(deleter)> object(AllocatorTraits::allocate(alloc, 1), deleter);
+        AllocatorTraits::construct(alloc, object.get(), std::forward<Args>(args)...);
         assert(object != nullptr);
         return object.release();
     }
@@ -8076,14 +12025,16 @@ class basic_json
 
                 case value_t::null:
                 {
+                    object = nullptr;  // silence warning, see #821
                     break;
                 }
 
                 default:
                 {
+                    object = nullptr;  // silence warning, see #821
                     if (JSON_UNLIKELY(t == value_t::null))
                     {
-                        JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 2.1.1")); // LCOV_EXCL_LINE
+                        JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.2.0")); // LCOV_EXCL_LINE
                     }
                     break;
                 }
@@ -8126,31 +12077,31 @@ class basic_json
             array = create<array_t>(std::move(value));
         }
 
-        void destroy(value_t t)
+        void destroy(value_t t) noexcept
         {
             switch (t)
             {
                 case value_t::object:
                 {
                     AllocatorType<object_t> alloc;
-                    alloc.destroy(object);
-                    alloc.deallocate(object, 1);
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, object);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, object, 1);
                     break;
                 }
 
                 case value_t::array:
                 {
                     AllocatorType<array_t> alloc;
-                    alloc.destroy(array);
-                    alloc.deallocate(array, 1);
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, array);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, array, 1);
                     break;
                 }
 
                 case value_t::string:
                 {
                     AllocatorType<string_t> alloc;
-                    alloc.destroy(string);
-                    alloc.deallocate(string, 1);
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, string);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, string, 1);
                     break;
                 }
 
@@ -8171,7 +12122,7 @@ class basic_json
     value is changed, because the invariant expresses a relationship between
     @a m_type and @a m_value.
     */
-    void assert_invariant() const
+    void assert_invariant() const noexcept
     {
         assert(m_type != value_t::object or m_value.object != nullptr);
         assert(m_type != value_t::array or m_value.array != nullptr);
@@ -8183,19 +12134,32 @@ class basic_json
     // JSON parser callback //
     //////////////////////////
 
+    /*!
+    @brief parser event types
+
+    The parser callback distinguishes the following events:
+    - `object_start`: the parser read `{` and started to process a JSON object
+    - `key`: the parser read a key of a value in an object
+    - `object_end`: the parser read `}` and finished processing a JSON object
+    - `array_start`: the parser read `[` and started to process a JSON array
+    - `array_end`: the parser read `]` and finished processing a JSON array
+    - `value`: the parser finished reading a JSON value
+
+    @image html callback_events.png "Example when certain parse events are triggered"
+
+    @sa @ref parser_callback_t for more information and examples
+    */
     using parse_event_t = typename parser::parse_event_t;
 
     /*!
     @brief per-element parser callback type
 
     With a parser callback function, the result of parsing a JSON text can be
-    influenced. When passed to @ref parse(std::istream&, const
-    parser_callback_t) or @ref parse(const CharT, const parser_callback_t),
-    it is called on certain events (passed as @ref parse_event_t via parameter
-    @a event) with a set recursion depth @a depth and context JSON value
-    @a parsed. The return value of the callback function is a boolean
-    indicating whether the element that emitted the callback shall be kept or
-    not.
+    influenced. When passed to @ref parse, it is called on certain events
+    (passed as @ref parse_event_t via parameter @a event) with a set recursion
+    depth @a depth and context JSON value @a parsed. The return value of the
+    callback function is a boolean indicating whether the element that emitted
+    the callback shall be kept or not.
 
     We distinguish six scenarios (determined by the event type) in which the
     callback function can be called. The following table describes the values
@@ -8232,14 +12196,12 @@ class basic_json
     should be kept (`true`) or not (`false`). In the latter case, it is either
     skipped completely or replaced by an empty discarded object.
 
-    @sa @ref parse(std::istream&, parser_callback_t) or
-    @ref parse(const CharT, const parser_callback_t) for examples
+    @sa @ref parse for examples
 
     @since version 1.0.0
     */
     using parser_callback_t = typename parser::parser_callback_t;
 
-
     //////////////////
     // constructors //
     //////////////////
@@ -8312,7 +12274,7 @@ class basic_json
     @brief create a JSON value
 
     This is a "catch all" constructor for all compatible JSON types; that is,
-    types for which a `to_json()` method exsits. The constructor forwards the
+    types for which a `to_json()` method exists. The constructor forwards the
     parameter @a val to that method (to `json_serializer<U>::to_json` method
     with `U = uncvref_t<CompatibleType>`, to be exact).
 
@@ -8341,6 +12303,7 @@ class basic_json
     - @a CompatibleType is not derived from `std::istream`,
     - @a CompatibleType is not @ref basic_json (to avoid hijacking copy/move
          constructors),
+    - @a CompatibleType is not a different @ref basic_json type (i.e. with different template arguments)
     - @a CompatibleType is not a @ref basic_json nested type (e.g.,
          @ref json_pointer, @ref iterator, etc ...)
     - @ref @ref json_serializer<U> has a
@@ -8364,20 +12327,90 @@ class basic_json
 
     @since version 2.1.0
     */
-    template<typename CompatibleType, typename U = detail::uncvref_t<CompatibleType>,
-             detail::enable_if_t<not std::is_base_of<std::istream, U>::value and
-                                 not std::is_same<U, basic_json_t>::value and
-                                 not detail::is_basic_json_nested_type<
-                                     basic_json_t, U>::value and
-                                 detail::has_to_json<basic_json, U>::value,
-                                 int> = 0>
-    basic_json(CompatibleType && val) noexcept(noexcept(JSONSerializer<U>::to_json(
-                std::declval<basic_json_t&>(), std::forward<CompatibleType>(val))))
+    template <typename CompatibleType,
+              typename U = detail::uncvref_t<CompatibleType>,  // presumably CompatibleType without cv/ref qualifiers - confirm against detail::uncvref_t
+              detail::enable_if_t<
+                  not detail::is_basic_json<U>::value and detail::is_compatible_type<basic_json_t, U>::value, int> = 0>
+    basic_json(CompatibleType && val) noexcept(noexcept(  // noexcept exactly when the to_json call below is
+                JSONSerializer<U>::to_json(std::declval<basic_json_t&>(),
+                                           std::forward<CompatibleType>(val))))
     {
         JSONSerializer<U>::to_json(*this, std::forward<CompatibleType>(val));
         assert_invariant();
     }
 
+    /*!
+    @brief create a JSON value from an existing one
+
+    This is a constructor for existing @ref basic_json types.
+    It does not hijack copy/move constructors, since the parameter has different
+    template arguments than the current ones.
+
+    The constructor tries to convert the internal @ref m_value of the parameter.
+
+    @tparam BasicJsonType a type such that:
+    - @a BasicJsonType is a @ref basic_json type.
+    - @a BasicJsonType has different template arguments than @ref basic_json_t.
+
+    @param[in] val the @ref basic_json value to be converted.
+
+    @complexity Usually linear in the size of the passed @a val, also
+                depending on the implementation of the called `to_json()`
+                method.
+
+    @exceptionsafety Depends on the called constructor. For types directly
+    supported by the library (i.e., all types for which no `to_json()` function
+    was provided), strong guarantee holds: if an exception is thrown, there are
+    no changes to any JSON value.
+
+    @since version 3.2.0
+    */
+    template <typename BasicJsonType,
+              detail::enable_if_t<
+                  detail::is_basic_json<BasicJsonType>::value and not std::is_same<basic_json, BasicJsonType>::value, int> = 0>
+    basic_json(const BasicJsonType& val)
+    {
+        using src_object_t = typename BasicJsonType::object_t;
+        using src_array_t = typename BasicJsonType::array_t;
+        using src_string_t = typename BasicJsonType::string_t;
+        using src_boolean_t = typename BasicJsonType::boolean_t;
+        using src_integer_t = typename BasicJsonType::number_integer_t;
+        using src_unsigned_t = typename BasicJsonType::number_unsigned_t;
+        using src_float_t = typename BasicJsonType::number_float_t;
+
+        switch (val.type())
+        {
+            case value_t::null:  // no payload to convert
+                *this = nullptr;
+                break;
+            case value_t::discarded:  // only the type tag is set
+                m_type = value_t::discarded;
+                break;
+            case value_t::object:  // containers and strings are read through a const reference
+                JSONSerializer<src_object_t>::to_json(*this, val.template get_ref<const src_object_t&>());
+                break;
+            case value_t::array:
+                JSONSerializer<src_array_t>::to_json(*this, val.template get_ref<const src_array_t&>());
+                break;
+            case value_t::string:
+                JSONSerializer<src_string_t>::to_json(*this, val.template get_ref<const src_string_t&>());
+                break;
+            case value_t::boolean:  // scalars are read by value
+                JSONSerializer<src_boolean_t>::to_json(*this, val.template get<src_boolean_t>());
+                break;
+            case value_t::number_integer:
+                JSONSerializer<src_integer_t>::to_json(*this, val.template get<src_integer_t>());
+                break;
+            case value_t::number_unsigned:
+                JSONSerializer<src_unsigned_t>::to_json(*this, val.template get<src_unsigned_t>());
+                break;
+            case value_t::number_float:
+                JSONSerializer<src_float_t>::to_json(*this, val.template get<src_float_t>());
+                break;
+        }
+        assert_invariant();
+    }
+
     /*!
     @brief create a container (array or object) from an initializer list
 
@@ -8650,7 +12683,7 @@ class basic_json
     @warning A precondition is enforced with a runtime assertion that will
              result in calling `std::abort` if this precondition is not met.
              Assertions can be disabled by defining `NDEBUG` at compile time.
-             See http://en.cppreference.com/w/cpp/error/assert for more
+             See https://en.cppreference.com/w/cpp/error/assert for more
              information.
 
     @throw invalid_iterator.201 if iterators @a first and @a last are not
@@ -8790,7 +12823,7 @@ class basic_json
     changes to any JSON value.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is linear.
     - As postcondition, it holds: `other == basic_json(other)`.
@@ -8875,7 +12908,7 @@ class basic_json
     exceptions.
 
     @requirement This function helps `basic_json` satisfying the
-    [MoveConstructible](http://en.cppreference.com/w/cpp/concept/MoveConstructible)
+    [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible)
     requirements.
 
     @liveexample{The code below shows the move constructor explicitly called
@@ -8909,7 +12942,7 @@ class basic_json
     @complexity Linear.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is linear.
 
@@ -8946,14 +12979,14 @@ class basic_json
     @complexity Linear.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is linear.
     - All stored elements are destroyed and all memory is freed.
 
     @since version 1.0.0
     */
-    ~basic_json()
+    ~basic_json() noexcept
     {
         assert_invariant();
         m_value.destroy(m_type);
@@ -8984,11 +13017,14 @@ class basic_json
     @param[in] indent_char The character to use for indentation if @a indent is
     greater than `0`. The default is ` ` (space).
     @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
-    in the output are escaped with \uXXXX sequences, and the result consists
+    in the output are escaped with `\uXXXX` sequences, and the result consists
     of ASCII characters only.
 
     @return string containing the serialization of the JSON value
 
+    @throw type_error.316 if a string stored inside the JSON value is not
+                          UTF-8 encoded
+
     @complexity Linear.
 
     @exceptionsafety Strong guarantee: if an exception is thrown, there are no
@@ -9000,14 +13036,14 @@ class basic_json
 
     @see https://docs.python.org/2/library/json.html#json.dump
 
-    @since version 1.0.0; indentation character @a indent_char and option
-           @a ensure_ascii added in version 3.0.0
+    @since version 1.0.0; indentation character @a indent_char, option
+           @a ensure_ascii and exceptions added in version 3.0.0
     */
     string_t dump(const int indent = -1, const char indent_char = ' ',
                   const bool ensure_ascii = false) const
     {
         string_t result;
-        serializer s(detail::output_adapter<char>(result), indent_char);
+        serializer s(detail::output_adapter<char, string_t>(result), indent_char);
 
         if (indent >= 0)
         {
@@ -9035,7 +13071,7 @@ class basic_json
             string                    | value_t::string
             number (integer)          | value_t::number_integer
             number (unsigned integer) | value_t::number_unsigned
-            number (foating-point)    | value_t::number_float
+            number (floating-point)   | value_t::number_float
             object                    | value_t::object
             array                     | value_t::array
             discarded                 | value_t::discarded
@@ -9048,7 +13084,7 @@ class basic_json
     @liveexample{The following code exemplifies `type()` for all JSON
     types.,type}
 
-    @sa @ref operator value_t() -- return the type of the JSON value (implicit) 
+    @sa @ref operator value_t() -- return the type of the JSON value (implicit)
     @sa @ref type_name() -- return the type as string
 
     @since version 1.0.0
@@ -9539,22 +13575,43 @@ class basic_json
 
     @since version 2.1.0
     */
-    template <
-        typename BasicJsonType,
-        detail::enable_if_t<std::is_same<typename std::remove_const<BasicJsonType>::type,
-                                         basic_json_t>::value,
-                            int> = 0 >
+    template<typename BasicJsonType, detail::enable_if_t<
+                 std::is_same<typename std::remove_const<BasicJsonType>::type, basic_json_t>::value,
+                 int> = 0>
     basic_json get() const
     {
         return *this;
     }
 
+    /*!
+    @brief get special-case overload
+
+    This overload converts the current @ref basic_json into a different
+    @ref basic_json type.
+
+    @tparam BasicJsonType a @ref basic_json type other than the type of `*this`
+
+    @return a copy of `*this`, converted into @a BasicJsonType
+
+    @complexity Depending on the implementation of the called `from_json()`
+                method.
+
+    @since version 3.2.0
+    */
+    template<typename BasicJsonType, detail::enable_if_t<
+                 not std::is_same<BasicJsonType, basic_json>::value and
+                 detail::is_basic_json<BasicJsonType>::value, int> = 0>
+    BasicJsonType get() const
+    {
+        return *this;
+    }
+
     /*!
     @brief get a value (explicit)
 
     Explicit type conversion between the JSON value and a compatible value
-    which is [CopyConstructible](http://en.cppreference.com/w/cpp/concept/CopyConstructible)
-    and [DefaultConstructible](http://en.cppreference.com/w/cpp/concept/DefaultConstructible).
+    which is [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
+    and [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
     The value is converted by calling the @ref json_serializer<ValueType>
     `from_json()` method.
 
@@ -9588,14 +13645,12 @@ class basic_json
 
     @since version 2.1.0
     */
-    template <
-        typename ValueTypeCV,
-        typename ValueType = detail::uncvref_t<ValueTypeCV>,
-        detail::enable_if_t <
-            not std::is_same<basic_json_t, ValueType>::value and
-            detail::has_from_json<basic_json_t, ValueType>::value and
-            not detail::has_non_default_from_json<basic_json_t, ValueType>::value,
-            int > = 0 >
+    template<typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>,
+             detail::enable_if_t <
+                 not detail::is_basic_json<ValueType>::value and
+                 detail::has_from_json<basic_json_t, ValueType>::value and
+                 not detail::has_non_default_from_json<basic_json_t, ValueType>::value,
+                 int> = 0>
     ValueType get() const noexcept(noexcept(
                                        JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), std::declval<ValueType&>())))
     {
@@ -9616,8 +13671,8 @@ class basic_json
     @brief get a value (explicit); special case
 
     Explicit type conversion between the JSON value and a compatible value
-    which is **not** [CopyConstructible](http://en.cppreference.com/w/cpp/concept/CopyConstructible)
-    and **not** [DefaultConstructible](http://en.cppreference.com/w/cpp/concept/DefaultConstructible).
+    which is **not** [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
+    and **not** [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
     The value is converted by calling the @ref json_serializer<ValueType>
     `from_json()` method.
 
@@ -9643,12 +13698,10 @@ class basic_json
 
     @since version 2.1.0
     */
-    template <
-        typename ValueTypeCV,
-        typename ValueType = detail::uncvref_t<ValueTypeCV>,
-        detail::enable_if_t<not std::is_same<basic_json_t, ValueType>::value and
-                            detail::has_non_default_from_json<basic_json_t,
-                                    ValueType>::value, int> = 0 >
+    template<typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>,
+             detail::enable_if_t<not std::is_same<basic_json_t, ValueType>::value and
+                                 detail::has_non_default_from_json<basic_json_t, ValueType>::value,
+                                 int> = 0>
     ValueType get() const noexcept(noexcept(
                                        JSONSerializer<ValueTypeCV>::from_json(std::declval<const basic_json_t&>())))
     {
@@ -9657,6 +13710,99 @@ class basic_json
         return JSONSerializer<ValueTypeCV>::from_json(*this);
     }
 
+    /*!
+    @brief get a value (explicit)
+
+    Explicit type conversion between the JSON value and a compatible value.
+    The value is filled into the input parameter by calling the @ref json_serializer<ValueType>
+    `from_json()` method.
+
+    The function is equivalent to executing
+    @code {.cpp}
+    JSONSerializer<ValueType>::from_json(*this, v);
+    return v;
+    @endcode
+
+    This overload is chosen if:
+    - @a ValueType is not @ref basic_json, and
+    - @ref json_serializer<ValueType> has a `from_json()` method of the form
+      `void from_json(const basic_json&, ValueType&)`.
+
+    @tparam ValueType the input parameter type.
+
+    @return the input parameter, allowing chaining calls.
+
+    @throw what @ref json_serializer<ValueType> `from_json()` method throws
+
+    @liveexample{The example below shows several conversions from JSON values
+    to other types. There are a few things to note: (1) Floating-point numbers can
+    be converted to integers\, (2) A JSON array can be converted to a standard
+    `std::vector<short>`\, (3) A JSON object can be converted to C++
+    associative containers such as `std::unordered_map<std::string\,
+    json>`.,get_to}
+
+    @since version 3.3.0
+    */
+    template<typename ValueType,
+             detail::enable_if_t <
+                 not detail::is_basic_json<ValueType>::value and
+                 detail::has_from_json<basic_json_t, ValueType>::value,
+                 int> = 0>
+    ValueType & get_to(ValueType& v) const noexcept(noexcept(
+                JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), v)))
+    {
+        JSONSerializer<ValueType>::from_json(*this, v);
+        return v;
+    }
+
+
+    /*!
+    @brief get a pointer value (implicit)
+
+    Implicit pointer access to the internally stored JSON value. No copies are
+    made.
+
+    @warning Writing data to the pointee of the result yields an undefined
+    state.
+
+    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
+    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
+    @ref number_unsigned_t, or @ref number_float_t. Checked at compile time:
+    the return type requires `get_impl_ptr()` to accept @a PointerType.
+
+    @return pointer to the internally stored JSON value if the requested
+    pointer type @a PointerType fits to the JSON value; `nullptr` otherwise
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how pointers to internal values of a
+    JSON value can be requested. Note that no type conversions are made and a
+    `nullptr` is returned if the value and the requested pointer type do not
+    match.,get_ptr}
+
+    @since version 1.0.0
+    */
+    template<typename PointerType, typename std::enable_if<
+                 std::is_pointer<PointerType>::value, int>::type = 0>
+    auto get_ptr() noexcept -> decltype(std::declval<basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
+    {
+        // delegate to get_impl_ptr<>(); the null argument only selects the overload
+        return get_impl_ptr(static_cast<PointerType>(nullptr));
+    }
+
+    /*!
+    @brief get a pointer value (implicit)
+    @copydoc get_ptr()
+    */
+    template<typename PointerType, typename std::enable_if<
+                 std::is_pointer<PointerType>::value and
+                 std::is_const<typename std::remove_pointer<PointerType>::type>::value, int>::type = 0>
+    constexpr auto get_ptr() const noexcept -> decltype(std::declval<const basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
+    {
+        // delegate to get_impl_ptr<>() const; the null argument only selects the overload
+        return get_impl_ptr(static_cast<PointerType>(nullptr));
+    }
+
     /*!
     @brief get a pointer value (explicit)
 
@@ -9686,7 +13832,7 @@ class basic_json
     */
     template<typename PointerType, typename std::enable_if<
                  std::is_pointer<PointerType>::value, int>::type = 0>
-    PointerType get() noexcept
+    auto get() noexcept -> decltype(std::declval<basic_json_t&>().template get_ptr<PointerType>())
     {
         // delegate the call to get_ptr
         return get_ptr<PointerType>();
@@ -9698,89 +13844,12 @@ class basic_json
     */
     template<typename PointerType, typename std::enable_if<
                  std::is_pointer<PointerType>::value, int>::type = 0>
-    constexpr const PointerType get() const noexcept
+    constexpr auto get() const noexcept -> decltype(std::declval<const basic_json_t&>().template get_ptr<PointerType>())
     {
         // delegate the call to get_ptr
         return get_ptr<PointerType>();
     }
 
-    /*!
-    @brief get a pointer value (implicit)
-
-    Implicit pointer access to the internally stored JSON value. No copies are
-    made.
-
-    @warning Writing data to the pointee of the result yields an undefined
-    state.
-
-    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
-    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
-    @ref number_unsigned_t, or @ref number_float_t. Enforced by a static
-    assertion.
-
-    @return pointer to the internally stored JSON value if the requested
-    pointer type @a PointerType fits to the JSON value; `nullptr` otherwise
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how pointers to internal values of a
-    JSON value can be requested. Note that no type conversions are made and a
-    `nullptr` is returned if the value and the requested pointer type does not
-    match.,get_ptr}
-
-    @since version 1.0.0
-    */
-    template<typename PointerType, typename std::enable_if<
-                 std::is_pointer<PointerType>::value, int>::type = 0>
-    PointerType get_ptr() noexcept
-    {
-        // get the type of the PointerType (remove pointer and const)
-        using pointee_t = typename std::remove_const<typename
-                          std::remove_pointer<typename
-                          std::remove_const<PointerType>::type>::type>::type;
-        // make sure the type matches the allowed types
-        static_assert(
-            std::is_same<object_t, pointee_t>::value
-            or std::is_same<array_t, pointee_t>::value
-            or std::is_same<string_t, pointee_t>::value
-            or std::is_same<boolean_t, pointee_t>::value
-            or std::is_same<number_integer_t, pointee_t>::value
-            or std::is_same<number_unsigned_t, pointee_t>::value
-            or std::is_same<number_float_t, pointee_t>::value
-            , "incompatible pointer type");
-
-        // delegate the call to get_impl_ptr<>()
-        return get_impl_ptr(static_cast<PointerType>(nullptr));
-    }
-
-    /*!
-    @brief get a pointer value (implicit)
-    @copydoc get_ptr()
-    */
-    template<typename PointerType, typename std::enable_if<
-                 std::is_pointer<PointerType>::value and
-                 std::is_const<typename std::remove_pointer<PointerType>::type>::value, int>::type = 0>
-    constexpr const PointerType get_ptr() const noexcept
-    {
-        // get the type of the PointerType (remove pointer and const)
-        using pointee_t = typename std::remove_const<typename
-                          std::remove_pointer<typename
-                          std::remove_const<PointerType>::type>::type>::type;
-        // make sure the type matches the allowed types
-        static_assert(
-            std::is_same<object_t, pointee_t>::value
-            or std::is_same<array_t, pointee_t>::value
-            or std::is_same<string_t, pointee_t>::value
-            or std::is_same<boolean_t, pointee_t>::value
-            or std::is_same<number_integer_t, pointee_t>::value
-            or std::is_same<number_unsigned_t, pointee_t>::value
-            or std::is_same<number_float_t, pointee_t>::value
-            , "incompatible pointer type");
-
-        // delegate the call to get_impl_ptr<>() const
-        return get_impl_ptr(static_cast<const PointerType>(nullptr));
-    }
-
     /*!
     @brief get a reference value (implicit)
 
@@ -9860,13 +13929,16 @@ class basic_json
     template < typename ValueType, typename std::enable_if <
                    not std::is_pointer<ValueType>::value and
                    not std::is_same<ValueType, detail::json_ref<basic_json>>::value and
-                   not std::is_same<ValueType, typename string_t::value_type>::value
+                   not std::is_same<ValueType, typename string_t::value_type>::value and
+                   not detail::is_basic_json<ValueType>::value
+
 #ifndef _MSC_VER  // fix for issue #167 operator<< ambiguity under VS2015
                    and not std::is_same<ValueType, std::initializer_list<typename string_t::value_type>>::value
-#endif
-#if (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_MSC_VER) && _MSC_VER >1900 && defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464
+#if defined(JSON_HAS_CPP_17) && defined(_MSC_VER) and _MSC_VER <= 1914
                    and not std::is_same<ValueType, typename std::string_view>::value
 #endif
+#endif
+                   and detail::is_detected<detail::get_template_function, const basic_json_t&, ValueType>::value
                    , int >::type = 0 >
     operator ValueType() const
     {
@@ -10130,7 +14202,7 @@ class basic_json
             return m_value.array->operator[](idx);
         }
 
-        JSON_THROW(type_error::create(305, "cannot use operator[] with " + std::string(type_name())));
+        JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name())));
     }
 
     /*!
@@ -10142,7 +14214,7 @@ class basic_json
 
     @return const reference to the element at index @a idx
 
-    @throw type_error.305 if the JSON value is not an array; in that cases,
+    @throw type_error.305 if the JSON value is not an array; in that case,
     using the [] operator with an index makes no sense.
 
     @complexity Constant.
@@ -10160,7 +14232,7 @@ class basic_json
             return m_value.array->operator[](idx);
         }
 
-        JSON_THROW(type_error::create(305, "cannot use operator[] with " + std::string(type_name())));
+        JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name())));
     }
 
     /*!
@@ -10206,7 +14278,7 @@ class basic_json
             return m_value.object->operator[](key);
         }
 
-        JSON_THROW(type_error::create(305, "cannot use operator[] with " + std::string(type_name())));
+        JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
     }
 
     /*!
@@ -10225,7 +14297,7 @@ class basic_json
     @pre The element with key @a key must exist. **This precondition is
          enforced with an assertion.**
 
-    @throw type_error.305 if the JSON value is not an object; in that cases,
+    @throw type_error.305 if the JSON value is not an object; in that case,
     using the [] operator with a key makes no sense.
 
     @complexity Logarithmic in the size of the container.
@@ -10248,7 +14320,7 @@ class basic_json
             return m_value.object->find(key)->second;
         }
 
-        JSON_THROW(type_error::create(305, "cannot use operator[] with " + std::string(type_name())));
+        JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
     }
 
     /*!
@@ -10295,7 +14367,7 @@ class basic_json
             return m_value.object->operator[](key);
         }
 
-        JSON_THROW(type_error::create(305, "cannot use operator[] with " + std::string(type_name())));
+        JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
     }
 
     /*!
@@ -10314,7 +14386,7 @@ class basic_json
     @pre The element with key @a key must exist. **This precondition is
          enforced with an assertion.**
 
-    @throw type_error.305 if the JSON value is not an object; in that cases,
+    @throw type_error.305 if the JSON value is not an object; in that case,
     using the [] operator with a key makes no sense.
 
     @complexity Logarithmic in the size of the container.
@@ -10338,7 +14410,7 @@ class basic_json
             return m_value.object->find(key)->second;
         }
 
-        JSON_THROW(type_error::create(305, "cannot use operator[] with " + std::string(type_name())));
+        JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
     }
 
     /*!
@@ -10374,7 +14446,7 @@ class basic_json
     @return copy of the element at key @a key or @a default_value if @a key
     is not found
 
-    @throw type_error.306 if the JSON value is not an objec; in that cases,
+    @throw type_error.306 if the JSON value is not an object; in that case,
     using `value()` with a key makes no sense.
 
     @complexity Logarithmic in the size of the container.
@@ -10447,7 +14519,7 @@ class basic_json
     @return copy of the element at key @a key or @a default_value if @a key
     is not found
 
-    @throw type_error.306 if the JSON value is not an objec; in that cases,
+    @throw type_error.306 if the JSON value is not an object; in that case,
     using `value()` with a key makes no sense.
 
     @complexity Logarithmic in the size of the container.
@@ -10471,7 +14543,7 @@ class basic_json
             {
                 return ptr.get_checked(this);
             }
-            JSON_CATCH (out_of_range&)
+            JSON_INTERNAL_CATCH (out_of_range&)
             {
                 return default_value;
             }
@@ -10651,8 +14723,8 @@ class basic_json
                 if (is_string())
                 {
                     AllocatorType<string_t> alloc;
-                    alloc.destroy(m_value.string);
-                    alloc.deallocate(m_value.string, 1);
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
                     m_value.string = nullptr;
                 }
 
@@ -10757,8 +14829,8 @@ class basic_json
                 if (is_string())
                 {
                     AllocatorType<string_t> alloc;
-                    alloc.destroy(m_value.string);
-                    alloc.deallocate(m_value.string, 1);
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
                     m_value.string = nullptr;
                 }
 
@@ -10890,7 +14962,7 @@ class basic_json
     @note This method always returns @ref end() when executed on a JSON type
           that is not an object.
 
-    @param[in] key key value of the element to search for
+    @param[in] key key value of the element to search for.
 
     @return Iterator to an element with key equivalent to @a key. If no such
     element is found or the JSON value is not an object, past-the-end (see
@@ -10902,13 +14974,14 @@ class basic_json
 
     @since version 1.0.0
     */
-    iterator find(typename object_t::key_type key)
+    template<typename KeyT>
+    iterator find(KeyT&& key)
     {
         auto result = end();
 
         if (is_object())
         {
-            result.m_it.object_iterator = m_value.object->find(key);
+            result.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
         }
 
         return result;
@@ -10916,15 +14989,16 @@ class basic_json
 
     /*!
     @brief find an element in a JSON object
-    @copydoc find(typename object_t::key_type)
+    @copydoc find(KeyT&&)
     */
-    const_iterator find(typename object_t::key_type key) const
+    template<typename KeyT>
+    const_iterator find(KeyT&& key) const
     {
         auto result = cend();
 
         if (is_object())
         {
-            result.m_it.object_iterator = m_value.object->find(key);
+            result.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
         }
 
         return result;
@@ -10951,10 +15025,11 @@ class basic_json
 
     @since version 1.0.0
     */
-    size_type count(typename object_t::key_type key) const
+    template<typename KeyT>
+    size_type count(KeyT&& key) const
     {
         // return 0 for all nonobject types
-        return is_object() ? m_value.object->count(key) : 0;
+        return is_object() ? m_value.object->count(std::forward<KeyT>(key)) : 0;
     }
 
     /// @}
@@ -10979,7 +15054,7 @@ class basic_json
     @complexity Constant.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is constant.
 
@@ -11018,7 +15093,7 @@ class basic_json
     @complexity Constant.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is constant.
     - Has the semantics of `const_cast<const basic_json&>(*this).begin()`.
@@ -11050,7 +15125,7 @@ class basic_json
     @complexity Constant.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is constant.
 
@@ -11089,7 +15164,7 @@ class basic_json
     @complexity Constant.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is constant.
     - Has the semantics of `const_cast<const basic_json&>(*this).end()`.
@@ -11119,7 +15194,7 @@ class basic_json
     @complexity Constant.
 
     @requirement This function helps `basic_json` satisfying the
-    [ReversibleContainer](http://en.cppreference.com/w/cpp/concept/ReversibleContainer)
+    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
     requirements:
     - The complexity is constant.
     - Has the semantics of `reverse_iterator(end())`.
@@ -11156,7 +15231,7 @@ class basic_json
     @complexity Constant.
 
     @requirement This function helps `basic_json` satisfying the
-    [ReversibleContainer](http://en.cppreference.com/w/cpp/concept/ReversibleContainer)
+    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
     requirements:
     - The complexity is constant.
     - Has the semantics of `reverse_iterator(begin())`.
@@ -11193,7 +15268,7 @@ class basic_json
     @complexity Constant.
 
     @requirement This function helps `basic_json` satisfying the
-    [ReversibleContainer](http://en.cppreference.com/w/cpp/concept/ReversibleContainer)
+    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
     requirements:
     - The complexity is constant.
     - Has the semantics of `const_cast<const basic_json&>(*this).rbegin()`.
@@ -11222,7 +15297,7 @@ class basic_json
     @complexity Constant.
 
     @requirement This function helps `basic_json` satisfying the
-    [ReversibleContainer](http://en.cppreference.com/w/cpp/concept/ReversibleContainer)
+    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
     requirements:
     - The complexity is constant.
     - Has the semantics of `const_cast<const basic_json&>(*this).rend()`.
@@ -11249,22 +15324,133 @@ class basic_json
     reference to the JSON values is returned, so there is no access to the
     underlying iterator.
 
+    For loop without iterator_wrapper:
+
+    @code{cpp}
+    for (auto it = j_object.begin(); it != j_object.end(); ++it)
+    {
+        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+    }
+    @endcode
+
+    Range-based for loop without iterator proxy:
+
+    @code{cpp}
+    for (auto it : j_object)
+    {
+        // "it" is of type json::reference and has no key() member
+        std::cout << "value: " << it << '\n';
+    }
+    @endcode
+
+    Range-based for loop with iterator proxy:
+
+    @code{cpp}
+    for (auto it : json::iterator_wrapper(j_object))
+    {
+        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+    }
+    @endcode
+
+    @note When iterating over an array, `key()` will return the index of the
+          element as string (see example).
+
+    @param[in] ref  reference to a JSON value
+    @return iteration proxy object wrapping @a ref with an interface to use in
+            range-based for loops
+
     @liveexample{The following code shows how the wrapper is used,iterator_wrapper}
 
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Constant.
+
     @note The name of this function is not yet final and may change in the
     future.
+
+    @deprecated This function is deprecated and will be removed in
+                version 4.0.0 of the library. Please use @ref items() instead;
+                that is, replace `json::iterator_wrapper(j)` with `j.items()`.
     */
-    static iteration_proxy<iterator> iterator_wrapper(reference cont)
+    JSON_DEPRECATED
+    static iteration_proxy<iterator> iterator_wrapper(reference ref) noexcept
     {
-        return iteration_proxy<iterator>(cont);
+        return ref.items();
     }
 
     /*!
     @copydoc iterator_wrapper(reference)
     */
-    static iteration_proxy<const_iterator> iterator_wrapper(const_reference cont)
+    JSON_DEPRECATED
+    static iteration_proxy<const_iterator> iterator_wrapper(const_reference ref) noexcept
     {
-        return iteration_proxy<const_iterator>(cont);
+        return ref.items();
+    }
+
+    /*!
+    @brief helper to access iterator member functions in range-based for
+
+    This function allows accessing @ref iterator::key() and @ref
+    iterator::value() during range-based for loops. In these loops, a
+    reference to the JSON values is returned, so there is no access to the
+    underlying iterator.
+
+    For loop without `items()` function:
+
+    @code{cpp}
+    for (auto it = j_object.begin(); it != j_object.end(); ++it)
+    {
+        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+    }
+    @endcode
+
+    Range-based for loop without `items()` function:
+
+    @code{cpp}
+    for (auto it : j_object)
+    {
+        // "it" is of type json::reference and has no key() member
+        std::cout << "value: " << it << '\n';
+    }
+    @endcode
+
+    Range-based for loop with `items()` function:
+
+    @code{cpp}
+    for (auto it : j_object.items())
+    {
+        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+    }
+    @endcode
+
+    @note When iterating over an array, `key()` will return the index of the
+          element as string (see example). For primitive types (e.g., numbers),
+          `key()` returns an empty string.
+
+    @return iteration proxy object wrapping this JSON value (`*this`) with an
+            interface to use in range-based for loops
+
+    @liveexample{The following code shows how the function is used.,items}
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Constant.
+
+    @since version 3.1.0.
+    */
+    iteration_proxy<iterator> items() noexcept
+    {
+        return iteration_proxy<iterator>(*this);
+    }
+
+    /*!
+    @copydoc items()
+    */
+    iteration_proxy<const_iterator> items() const noexcept
+    {
+        return iteration_proxy<const_iterator>(*this);  // const overload: proxy is built over const_iterator
+    }
 
     /// @}
@@ -11309,7 +15495,7 @@ class basic_json
     false in the case of a string.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is constant.
     - Has the semantics of `begin() == end()`.
@@ -11380,7 +15566,7 @@ class basic_json
     the case of a string.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is constant.
     - Has the semantics of `std::distance(begin(), end())`.
@@ -11450,7 +15636,7 @@ class basic_json
     @exceptionsafety No-throw guarantee: this function never throws exceptions.
 
     @requirement This function helps `basic_json` satisfying the
-    [Container](http://en.cppreference.com/w/cpp/concept/Container)
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
     requirements:
     - The complexity is constant.
     - Has the semantics of returning `b.size()` where `b` is the largest
@@ -11862,6 +16048,26 @@ class basic_json
         return {it, res.second};
     }
 
+    /// Helper for insertion of an iterator
+    /// @note: This uses std::distance to support GCC 4.8,
+    ///        see https://github.com/nlohmann/json/pull/1257
+    template<typename... Args>
+    iterator insert_iterator(const_iterator pos, Args&& ... args)
+    {
+        iterator result(this);
+        assert(m_value.array != nullptr);
+
+        auto insert_pos = std::distance(m_value.array->begin(), pos.m_it.array_iterator);
+        m_value.array->insert(pos.m_it.array_iterator, std::forward<Args>(args)...);
+        result.m_it.array_iterator = m_value.array->begin() + insert_pos;
+
+        // This could have been written as:
+        // result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, cnt, val);
+        // but the return value of insert is missing in GCC 4.8, so it is written this way instead.
+
+        return result;
+    }
+
     /*!
     @brief inserts element
 
@@ -11896,9 +16102,7 @@ class basic_json
             }
 
             // insert to array and return iterator
-            iterator result(this);
-            result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, val);
-            return result;
+            return insert_iterator(pos, val);
         }
 
         JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
@@ -11949,9 +16153,7 @@ class basic_json
             }
 
             // insert to array and return iterator
-            iterator result(this);
-            result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, cnt, val);
-            return result;
+            return insert_iterator(pos, cnt, val);
         }
 
         JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
@@ -12007,18 +16209,13 @@ class basic_json
             JSON_THROW(invalid_iterator::create(210, "iterators do not fit"));
         }
 
-        if (JSON_UNLIKELY(first.m_object == this or last.m_object == this))
+        if (JSON_UNLIKELY(first.m_object == this))
         {
             JSON_THROW(invalid_iterator::create(211, "passed iterators may not belong to container"));
         }
 
         // insert to array and return iterator
-        iterator result(this);
-        result.m_it.array_iterator = m_value.array->insert(
-                                         pos.m_it.array_iterator,
-                                         first.m_it.array_iterator,
-                                         last.m_it.array_iterator);
-        return result;
+        return insert_iterator(pos, first.m_it.array_iterator, last.m_it.array_iterator);
     }
 
     /*!
@@ -12060,9 +16257,7 @@ class basic_json
         }
 
         // insert to array and return iterator
-        iterator result(this);
-        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, ilist.begin(), ilist.end());
-        return result;
+        return insert_iterator(pos, ilist.begin(), ilist.end());
     }
 
     /*!
@@ -12103,8 +16298,7 @@ class basic_json
         }
 
         // passed iterators must belong to objects
-        if (JSON_UNLIKELY(not first.m_object->is_object()
-                          or not last.m_object->is_object()))
+        if (JSON_UNLIKELY(not first.m_object->is_object()))
         {
             JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects"));
         }
@@ -12150,7 +16344,7 @@ class basic_json
             JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(j.type_name())));
         }
 
-        for (auto it = j.begin(); it != j.end(); ++it)
+        for (auto it = j.cbegin(); it != j.cend(); ++it)
         {
             m_value.object->operator[](it.key()) = it.value();
         }
@@ -12205,7 +16399,7 @@ class basic_json
 
         // passed iterators must belong to objects
         if (JSON_UNLIKELY(not first.m_object->is_object()
-                          or not first.m_object->is_object()))
+                          or not last.m_object->is_object()))
         {
             JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects"));
         }
@@ -12371,7 +16565,7 @@ class basic_json
     [comparison function](https://github.com/mariokonrad/marnav/blob/master/src/marnav/math/floatingpoint.hpp#L34-#L39)
     could be used, for instance
     @code {.cpp}
-    template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value, T>::type>
+    template<typename T, typename = typename std::enable_if<std::is_floating_point<T>::value, T>::type>
     inline bool is_same(T a, T b, T epsilon = std::numeric_limits<T>::epsilon()) noexcept
     {
         return std::abs(a - b) <= epsilon;
@@ -12799,7 +16993,7 @@ class basic_json
       `std::setw(4)` on @a o sets the indentation level to `4` and the
       serialization result is the same as calling `dump(4)`.
 
-    - The indentation characrer can be controlled with the member variable
+    - The indentation character can be controlled with the member variable
       `fill` of the output stream @a o. For instance, the manipulator
       `std::setfill('\\t')` sets indentation to use a tab character rather than
       the default space character.
@@ -12809,12 +17003,15 @@ class basic_json
 
     @return the stream @a o
 
+    @throw type_error.316 if a string stored inside the JSON value is not
+                          UTF-8 encoded
+
     @complexity Linear.
 
     @liveexample{The example below shows the serialization with different
     parameters to `width` to adjust the indentation level.,operator_serialize}
 
-    @since version 1.0.0; indentaction character added in version 3.0.0
+    @since version 1.0.0; indentation character added in version 3.0.0
     */
     friend std::ostream& operator<<(std::ostream& o, const basic_json& j)
     {
@@ -12833,10 +17030,11 @@ class basic_json
 
     /*!
     @brief serialize to stream
-    @deprecated This stream operator is deprecated and will be removed in a
-                future version of the library. Please use
+    @deprecated This stream operator is deprecated and will be removed in
+                version 4.0.0 of the library. Please use
                 @ref operator<<(std::ostream&, const basic_json&)
                 instead; that is, replace calls like `j >> o;` with `o << j;`.
+    @since version 1.0.0; deprecated since version 3.0.0
     */
     JSON_DEPRECATED
     friend std::ostream& operator>>(const basic_json& j, std::ostream& o)
@@ -12916,7 +17114,7 @@ class basic_json
 
     @since version 2.0.3 (contiguous containers)
     */
-    static basic_json parse(detail::input_adapter i,
+    static basic_json parse(detail::input_adapter&& i,
                             const parser_callback_t cb = nullptr,
                             const bool allow_exceptions = true)
     {
@@ -12925,26 +17123,80 @@ class basic_json
         return result;
     }
 
+    static bool accept(detail::input_adapter&& i)
+    {
+        return parser(i).accept(true);
+    }
+
     /*!
-    @copydoc basic_json parse(detail::input_adapter, const parser_callback_t)
+    @brief generate SAX events
+
+    The SAX event listener must follow the interface of @ref json_sax.
+
+    This function reads from a compatible input. Examples are:
+    - an array of 1-byte values
+    - strings with character/literal type with size of 1 byte
+    - input streams
+    - container with contiguous storage of 1-byte values. Compatible container
+      types include `std::vector`, `std::string`, `std::array`,
+      `std::valarray`, and `std::initializer_list`. Furthermore, C-style
+      arrays can be used with `std::begin()`/`std::end()`. User-defined
+      containers can be used as long as they implement random-access iterators
+      and a contiguous storage.
+
+    @pre Each element of the container has a size of 1 byte. Violating this
+    precondition yields undefined behavior. **This precondition is enforced
+    with a static assertion.**
+
+    @pre The container storage is contiguous. Violating this precondition
+    yields undefined behavior. **This precondition is enforced with an
+    assertion.**
+    @pre Each element of the container has a size of 1 byte. Violating this
+    precondition yields undefined behavior. **This precondition is enforced
+    with a static assertion.**
+
+    @warning There is no way to enforce all preconditions at compile-time. If
+             the function is called with a noncompliant container and with
+             assertions switched off, the behavior is undefined and will most
+             likely yield a segmentation violation.
+
+    @param[in] i  input to read from
+    @param[in,out] sax  SAX event listener
+    @param[in] format  the format to parse (JSON, CBOR, MessagePack, or UBJSON)
+    @param[in] strict  whether the input has to be consumed completely
+
+    @return return value of the last processed SAX event
+
+    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
+    of input; expected string literal""`
+    @throw parse_error.102 if to_unicode fails or surrogate error
+    @throw parse_error.103 if to_unicode fails
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser. The complexity can be higher if the SAX consumer @a sax has
+    a super-linear complexity.
+
+    @note A UTF-8 byte order mark is silently ignored.
+
+    @liveexample{The example below demonstrates the `sax_parse()` function
+    reading from string and processing the events with a user-defined SAX
+    event consumer.,sax_parse}
+
+    @since version 3.2.0
     */
-    static basic_json parse(detail::input_adapter& i,
-                            const parser_callback_t cb = nullptr,
-                            const bool allow_exceptions = true)
+    template <typename SAX>
+    static bool sax_parse(detail::input_adapter&& i, SAX* sax,
+                          input_format_t format = input_format_t::json,
+                          const bool strict = true)
     {
-        basic_json result;
-        parser(i, cb, allow_exceptions).parse(true, result);
-        return result;
-    }
-
-    static bool accept(detail::input_adapter i)
-    {
-        return parser(i).accept(true);
-    }
-
-    static bool accept(detail::input_adapter& i)
-    {
-        return parser(i).accept(true);
+        assert(sax);
+        switch (format)
+        {
+            case input_format_t::json:
+                return parser(std::move(i)).sax_parse(sax, strict);
+            default:
+                return detail::binary_reader<basic_json, SAX>(std::move(i)).sax_parse(format, sax, strict);
+        }
     }
 
     /*!
@@ -12974,6 +17226,8 @@ class basic_json
     @param[in] cb  a parser callback function of type @ref parser_callback_t
     which is used to control the deserialization by filtering unwanted values
     (optional)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
 
     @return result of the deserialization
 
@@ -13014,12 +17268,22 @@ class basic_json
         return parser(detail::input_adapter(first, last)).accept(true);
     }
 
+    template<class IteratorType, class SAX, typename std::enable_if<
+                 std::is_base_of<
+                     std::random_access_iterator_tag,
+                     typename std::iterator_traits<IteratorType>::iterator_category>::value, int>::type = 0>
+    static bool sax_parse(IteratorType first, IteratorType last, SAX* sax)
+    {
+        return parser(detail::input_adapter(first, last)).sax_parse(sax);
+    }
+
     /*!
     @brief deserialize from stream
-    @deprecated This stream operator is deprecated and will be removed in a
-                future version of the library. Please use
+    @deprecated This stream operator is deprecated and will be removed in
+                version 4.0.0 of the library. Please use
                 @ref operator>>(std::istream&, basic_json&)
                 instead; that is, replace calls like `j << i;` with `i >> j;`.
+    @since version 1.0.0; deprecated since version 3.0.0
     */
     JSON_DEPRECATED
     friend std::istream& operator<<(basic_json& j, std::istream& i)
@@ -13089,7 +17353,7 @@ class basic_json
     types.,type_name}
 
     @sa @ref type() -- return the type of the JSON value
-    @sa @ref operator value_t() -- return the type of the JSON value (implicit) 
+    @sa @ref operator value_t() -- return the type of the JSON value (implicit)
 
     @since version 1.0.0, public since 2.1.0, `const char*` and `noexcept`
     since 3.0.0
@@ -13150,40 +17414,40 @@ class basic_json
 
     JSON value type | value/range                                | CBOR type                          | first byte
     --------------- | ------------------------------------------ | ---------------------------------- | ---------------
-    null            | `null`                                     | Null                               | 0xf6
-    boolean         | `true`                                     | True                               | 0xf5
-    boolean         | `false`                                    | False                              | 0xf4
-    number_integer  | -9223372036854775808..-2147483649          | Negative integer (8 bytes follow)  | 0x3b
-    number_integer  | -2147483648..-32769                        | Negative integer (4 bytes follow)  | 0x3a
+    null            | `null`                                     | Null                               | 0xF6
+    boolean         | `true`                                     | True                               | 0xF5
+    boolean         | `false`                                    | False                              | 0xF4
+    number_integer  | -9223372036854775808..-2147483649          | Negative integer (8 bytes follow)  | 0x3B
+    number_integer  | -2147483648..-32769                        | Negative integer (4 bytes follow)  | 0x3A
     number_integer  | -32768..-129                               | Negative integer (2 bytes follow)  | 0x39
     number_integer  | -128..-25                                  | Negative integer (1 byte follow)   | 0x38
     number_integer  | -24..-1                                    | Negative integer                   | 0x20..0x37
     number_integer  | 0..23                                      | Integer                            | 0x00..0x17
     number_integer  | 24..255                                    | Unsigned integer (1 byte follow)   | 0x18
     number_integer  | 256..65535                                 | Unsigned integer (2 bytes follow)  | 0x19
-    number_integer  | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1a
-    number_integer  | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1b
+    number_integer  | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1A
+    number_integer  | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1B
     number_unsigned | 0..23                                      | Integer                            | 0x00..0x17
     number_unsigned | 24..255                                    | Unsigned integer (1 byte follow)   | 0x18
     number_unsigned | 256..65535                                 | Unsigned integer (2 bytes follow)  | 0x19
-    number_unsigned | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1a
-    number_unsigned | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1b
-    number_float    | *any value*                                | Double-Precision Float             | 0xfb
+    number_unsigned | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1A
+    number_unsigned | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1B
+    number_float    | *any value*                                | Double-Precision Float             | 0xFB
     string          | *length*: 0..23                            | UTF-8 string                       | 0x60..0x77
     string          | *length*: 23..255                          | UTF-8 string (1 byte follow)       | 0x78
     string          | *length*: 256..65535                       | UTF-8 string (2 bytes follow)      | 0x79
-    string          | *length*: 65536..4294967295                | UTF-8 string (4 bytes follow)      | 0x7a
-    string          | *length*: 4294967296..18446744073709551615 | UTF-8 string (8 bytes follow)      | 0x7b
+    string          | *length*: 65536..4294967295                | UTF-8 string (4 bytes follow)      | 0x7A
+    string          | *length*: 4294967296..18446744073709551615 | UTF-8 string (8 bytes follow)      | 0x7B
     array           | *size*: 0..23                              | array                              | 0x80..0x97
     array           | *size*: 23..255                            | array (1 byte follow)              | 0x98
     array           | *size*: 256..65535                         | array (2 bytes follow)             | 0x99
-    array           | *size*: 65536..4294967295                  | array (4 bytes follow)             | 0x9a
-    array           | *size*: 4294967296..18446744073709551615   | array (8 bytes follow)             | 0x9b
-    object          | *size*: 0..23                              | map                                | 0xa0..0xb7
-    object          | *size*: 23..255                            | map (1 byte follow)                | 0xb8
-    object          | *size*: 256..65535                         | map (2 bytes follow)               | 0xb9
-    object          | *size*: 65536..4294967295                  | map (4 bytes follow)               | 0xba
-    object          | *size*: 4294967296..18446744073709551615   | map (8 bytes follow)               | 0xbb
+    array           | *size*: 65536..4294967295                  | array (4 bytes follow)             | 0x9A
+    array           | *size*: 4294967296..18446744073709551615   | array (8 bytes follow)             | 0x9B
+    object          | *size*: 0..23                              | map                                | 0xA0..0xB7
+    object          | *size*: 23..255                            | map (1 byte follow)                | 0xB8
+    object          | *size*: 256..65535                         | map (2 bytes follow)               | 0xB9
+    object          | *size*: 65536..4294967295                  | map (4 bytes follow)               | 0xBA
+    object          | *size*: 4294967296..18446744073709551615   | map (8 bytes follow)               | 0xBB
 
     @note The mapping is **complete** in the sense that any JSON value type
           can be converted to a CBOR value.
@@ -13193,20 +17457,20 @@ class basic_json
           function which serializes NaN or Infinity to `null`.
 
     @note The following CBOR types are not used in the conversion:
-          - byte strings (0x40..0x5f)
-          - UTF-8 strings terminated by "break" (0x7f)
-          - arrays terminated by "break" (0x9f)
-          - maps terminated by "break" (0xbf)
-          - date/time (0xc0..0xc1)
-          - bignum (0xc2..0xc3)
-          - decimal fraction (0xc4)
-          - bigfloat (0xc5)
-          - tagged items (0xc6..0xd4, 0xd8..0xdb)
-          - expected conversions (0xd5..0xd7)
-          - simple values (0xe0..0xf3, 0xf8)
-          - undefined (0xf7)
-          - half and single-precision floats (0xf9-0xfa)
-          - break (0xff)
+          - byte strings (0x40..0x5F)
+          - UTF-8 strings terminated by "break" (0x7F)
+          - arrays terminated by "break" (0x9F)
+          - maps terminated by "break" (0xBF)
+          - date/time (0xC0..0xC1)
+          - bignum (0xC2..0xC3)
+          - decimal fraction (0xC4)
+          - bigfloat (0xC5)
+          - tagged items (0xC6..0xD4, 0xD8..0xDB)
+          - expected conversions (0xD5..0xD7)
+          - simple values (0xE0..0xF3, 0xF8)
+          - undefined (0xF7)
+          - half and single-precision floats (0xF9-0xFA)
+          - break (0xFF)
 
     @param[in] j  JSON value to serialize
     @return MessagePack serialization as byte vector
@@ -13217,9 +17481,11 @@ class basic_json
     vector in CBOR format.,to_cbor}
 
     @sa http://cbor.io
-    @sa @ref from_cbor(const std::vector<uint8_t>&, const size_t) for the
+    @sa @ref from_cbor(detail::input_adapter, const bool strict) for the
         analogous deserialization
     @sa @ref to_msgpack(const basic_json&) for the related MessagePack format
+    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
+             related UBJSON format
 
     @since version 2.0.9
     */
@@ -13252,35 +17518,35 @@ class basic_json
 
     JSON value type | value/range                       | MessagePack type | first byte
     --------------- | --------------------------------- | ---------------- | ----------
-    null            | `null`                            | nil              | 0xc0
-    boolean         | `true`                            | true             | 0xc3
-    boolean         | `false`                           | false            | 0xc2
-    number_integer  | -9223372036854775808..-2147483649 | int64            | 0xd3
-    number_integer  | -2147483648..-32769               | int32            | 0xd2
-    number_integer  | -32768..-129                      | int16            | 0xd1
-    number_integer  | -128..-33                         | int8             | 0xd0
-    number_integer  | -32..-1                           | negative fixint  | 0xe0..0xff
-    number_integer  | 0..127                            | positive fixint  | 0x00..0x7f
-    number_integer  | 128..255                          | uint 8           | 0xcc
-    number_integer  | 256..65535                        | uint 16          | 0xcd
-    number_integer  | 65536..4294967295                 | uint 32          | 0xce
-    number_integer  | 4294967296..18446744073709551615  | uint 64          | 0xcf
-    number_unsigned | 0..127                            | positive fixint  | 0x00..0x7f
-    number_unsigned | 128..255                          | uint 8           | 0xcc
-    number_unsigned | 256..65535                        | uint 16          | 0xcd
-    number_unsigned | 65536..4294967295                 | uint 32          | 0xce
-    number_unsigned | 4294967296..18446744073709551615  | uint 64          | 0xcf
-    number_float    | *any value*                       | float 64         | 0xcb
-    string          | *length*: 0..31                   | fixstr           | 0xa0..0xbf
-    string          | *length*: 32..255                 | str 8            | 0xd9
-    string          | *length*: 256..65535              | str 16           | 0xda
-    string          | *length*: 65536..4294967295       | str 32           | 0xdb
-    array           | *size*: 0..15                     | fixarray         | 0x90..0x9f
-    array           | *size*: 16..65535                 | array 16         | 0xdc
-    array           | *size*: 65536..4294967295         | array 32         | 0xdd
-    object          | *size*: 0..15                     | fix map          | 0x80..0x8f
-    object          | *size*: 16..65535                 | map 16           | 0xde
-    object          | *size*: 65536..4294967295         | map 32           | 0xdf
+    null            | `null`                            | nil              | 0xC0
+    boolean         | `true`                            | true             | 0xC3
+    boolean         | `false`                           | false            | 0xC2
+    number_integer  | -9223372036854775808..-2147483649 | int64            | 0xD3
+    number_integer  | -2147483648..-32769               | int32            | 0xD2
+    number_integer  | -32768..-129                      | int16            | 0xD1
+    number_integer  | -128..-33                         | int8             | 0xD0
+    number_integer  | -32..-1                           | negative fixint  | 0xE0..0xFF
+    number_integer  | 0..127                            | positive fixint  | 0x00..0x7F
+    number_integer  | 128..255                          | uint 8           | 0xCC
+    number_integer  | 256..65535                        | uint 16          | 0xCD
+    number_integer  | 65536..4294967295                 | uint 32          | 0xCE
+    number_integer  | 4294967296..18446744073709551615  | uint 64          | 0xCF
+    number_unsigned | 0..127                            | positive fixint  | 0x00..0x7F
+    number_unsigned | 128..255                          | uint 8           | 0xCC
+    number_unsigned | 256..65535                        | uint 16          | 0xCD
+    number_unsigned | 65536..4294967295                 | uint 32          | 0xCE
+    number_unsigned | 4294967296..18446744073709551615  | uint 64          | 0xCF
+    number_float    | *any value*                       | float 64         | 0xCB
+    string          | *length*: 0..31                   | fixstr           | 0xA0..0xBF
+    string          | *length*: 32..255                 | str 8            | 0xD9
+    string          | *length*: 256..65535              | str 16           | 0xDA
+    string          | *length*: 65536..4294967295       | str 32           | 0xDB
+    array           | *size*: 0..15                     | fixarray         | 0x90..0x9F
+    array           | *size*: 16..65535                 | array 16         | 0xDC
+    array           | *size*: 65536..4294967295         | array 32         | 0xDD
+    object          | *size*: 0..15                     | fix map          | 0x80..0x8F
+    object          | *size*: 16..65535                 | map 16           | 0xDE
+    object          | *size*: 65536..4294967295         | map 32           | 0xDF
 
     @note The mapping is **complete** in the sense that any JSON value type
           can be converted to a MessagePack value.
@@ -13291,10 +17557,10 @@ class basic_json
           - objects with more than 4294967295 elements
 
     @note The following MessagePack types are not used in the conversion:
-          - bin 8 - bin 32 (0xc4..0xc6)
-          - ext 8 - ext 32 (0xc7..0xc9)
-          - float 32 (0xca)
-          - fixext 1 - fixext 16 (0xd4..0xd8)
+          - bin 8 - bin 32 (0xC4..0xC6)
+          - ext 8 - ext 32 (0xC7..0xC9)
+          - float 32 (0xCA)
+          - fixext 1 - fixext 16 (0xD4..0xD8)
 
     @note Any MessagePack output created @ref to_msgpack can be successfully
           parsed by @ref from_msgpack.
@@ -13315,6 +17581,8 @@ class basic_json
     @sa @ref from_msgpack(const std::vector<uint8_t>&, const size_t) for the
         analogous deserialization
     @sa @ref to_cbor(const basic_json& for the related CBOR format
+    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
+             related UBJSON format
 
     @since version 2.0.9
     */
@@ -13335,6 +17603,107 @@ class basic_json
         binary_writer<char>(o).write_msgpack(j);
     }
 
+    /*!
+    @brief create a UBJSON serialization of a given JSON value
+
+    Serializes a given JSON value @a j to a byte vector using the UBJSON
+    (Universal Binary JSON) serialization format. UBJSON aims to be more compact
+    than JSON itself, yet more efficient to parse.
+
+    The library uses the following mapping from JSON values types to
+    UBJSON types according to the UBJSON specification:
+
+    JSON value type | value/range                       | UBJSON type | marker
+    --------------- | --------------------------------- | ----------- | ------
+    null            | `null`                            | null        | `Z`
+    boolean         | `true`                            | true        | `T`
+    boolean         | `false`                           | false       | `F`
+    number_integer  | -9223372036854775808..-2147483649 | int64       | `L`
+    number_integer  | -2147483648..-32769               | int32       | `l`
+    number_integer  | -32768..-129                      | int16       | `I`
+    number_integer  | -128..127                         | int8        | `i`
+    number_integer  | 128..255                          | uint8       | `U`
+    number_integer  | 256..32767                        | int16       | `I`
+    number_integer  | 32768..2147483647                 | int32       | `l`
+    number_integer  | 2147483648..9223372036854775807   | int64       | `L`
+    number_unsigned | 0..127                            | int8        | `i`
+    number_unsigned | 128..255                          | uint8       | `U`
+    number_unsigned | 256..32767                        | int16       | `I`
+    number_unsigned | 32768..2147483647                 | int32       | `l`
+    number_unsigned | 2147483648..9223372036854775807   | int64       | `L`
+    number_float    | *any value*                       | float64     | `D`
+    string          | *with shortest length indicator*  | string      | `S`
+    array           | *see notes on optimized format*   | array       | `[`
+    object          | *see notes on optimized format*   | map         | `{`
+
+    @note The mapping is **complete** in the sense that any JSON value type
+          can be converted to a UBJSON value.
+
+    @note The following values can **not** be converted to a UBJSON value:
+          - strings with more than 9223372036854775807 bytes (theoretical)
+          - unsigned integer numbers above 9223372036854775807
+
+    @note The following markers are not used in the conversion:
+          - `N`: no-op values are not created.
+          - `C`: single-byte strings are serialized with `S` markers.
+
+    @note Any UBJSON output created @ref to_ubjson can be successfully parsed
+          by @ref from_ubjson.
+
+    @note If NaN or Infinity are stored inside a JSON number, they are
+          serialized properly. This behavior differs from the @ref dump()
+          function which serializes NaN or Infinity to `null`.
+
+    @note The optimized formats for containers are supported: Parameter
+          @a use_size adds size information to the beginning of a container and
+          removes the closing marker. Parameter @a use_type further checks
+          whether all elements of a container have the same type and adds the
+          type marker to the beginning of the container. The @a use_type
+          parameter must only be used together with @a use_size = true. Note
+          that @a use_size = true alone may result in larger representations -
+          the benefit of this parameter is that the receiving side is
+          immediately informed on the number of elements of the container.
+
+    @param[in] j  JSON value to serialize
+    @param[in] use_size  whether to add size annotations to container types
+    @param[in] use_type  whether to add type annotations to container types
+                         (must be combined with @a use_size = true)
+    @return UBJSON serialization as byte vector
+
+    @complexity Linear in the size of the JSON value @a j.
+
+    @liveexample{The example shows the serialization of a JSON value to a byte
+    vector in UBJSON format.,to_ubjson}
+
+    @sa http://ubjson.org
+    @sa @ref from_ubjson(detail::input_adapter, const bool strict) for the
+        analogous deserialization
+    @sa @ref to_cbor(const basic_json&) for the related CBOR format
+    @sa @ref to_msgpack(const basic_json&) for the related MessagePack format
+
+    @since version 3.1.0
+    */
+    static std::vector<uint8_t> to_ubjson(const basic_json& j,
+                                          const bool use_size = false,
+                                          const bool use_type = false)
+    {
+        std::vector<uint8_t> result;
+        to_ubjson(j, result, use_size, use_type);
+        return result;
+    }
+
+    static void to_ubjson(const basic_json& j, detail::output_adapter<uint8_t> o,
+                          const bool use_size = false, const bool use_type = false)
+    {
+        binary_writer<uint8_t>(o).write_ubjson(j, use_size, use_type);
+    }
+
+    static void to_ubjson(const basic_json& j, detail::output_adapter<char> o,
+                          const bool use_size = false, const bool use_type = false)
+    {
+        binary_writer<char>(o).write_ubjson(j, use_size, use_type);
+    }
+
     /*!
     @brief create a JSON value from an input in CBOR format
 
@@ -13348,51 +17717,51 @@ class basic_json
     Integer                | number_unsigned | 0x00..0x17
     Unsigned integer       | number_unsigned | 0x18
     Unsigned integer       | number_unsigned | 0x19
-    Unsigned integer       | number_unsigned | 0x1a
-    Unsigned integer       | number_unsigned | 0x1b
+    Unsigned integer       | number_unsigned | 0x1A
+    Unsigned integer       | number_unsigned | 0x1B
     Negative integer       | number_integer  | 0x20..0x37
     Negative integer       | number_integer  | 0x38
     Negative integer       | number_integer  | 0x39
-    Negative integer       | number_integer  | 0x3a
-    Negative integer       | number_integer  | 0x3b
+    Negative integer       | number_integer  | 0x3A
+    Negative integer       | number_integer  | 0x3B
     Negative integer       | number_integer  | 0x40..0x57
     UTF-8 string           | string          | 0x60..0x77
     UTF-8 string           | string          | 0x78
     UTF-8 string           | string          | 0x79
-    UTF-8 string           | string          | 0x7a
-    UTF-8 string           | string          | 0x7b
-    UTF-8 string           | string          | 0x7f
+    UTF-8 string           | string          | 0x7A
+    UTF-8 string           | string          | 0x7B
+    UTF-8 string           | string          | 0x7F
     array                  | array           | 0x80..0x97
     array                  | array           | 0x98
     array                  | array           | 0x99
-    array                  | array           | 0x9a
-    array                  | array           | 0x9b
-    array                  | array           | 0x9f
-    map                    | object          | 0xa0..0xb7
-    map                    | object          | 0xb8
-    map                    | object          | 0xb9
-    map                    | object          | 0xba
-    map                    | object          | 0xbb
-    map                    | object          | 0xbf
-    False                  | `false`         | 0xf4
-    True                   | `true`          | 0xf5
-    Nill                   | `null`          | 0xf6
-    Half-Precision Float   | number_float    | 0xf9
-    Single-Precision Float | number_float    | 0xfa
-    Double-Precision Float | number_float    | 0xfb
+    array                  | array           | 0x9A
+    array                  | array           | 0x9B
+    array                  | array           | 0x9F
+    map                    | object          | 0xA0..0xB7
+    map                    | object          | 0xB8
+    map                    | object          | 0xB9
+    map                    | object          | 0xBA
+    map                    | object          | 0xBB
+    map                    | object          | 0xBF
+    False                  | `false`         | 0xF4
+    True                   | `true`          | 0xF5
+    Null                   | `null`          | 0xF6
+    Half-Precision Float   | number_float    | 0xF9
+    Single-Precision Float | number_float    | 0xFA
+    Double-Precision Float | number_float    | 0xFB
 
     @warning The mapping is **incomplete** in the sense that not all CBOR
              types can be converted to a JSON value. The following CBOR types
              are not supported and will yield parse errors (parse_error.112):
-             - byte strings (0x40..0x5f)
-             - date/time (0xc0..0xc1)
-             - bignum (0xc2..0xc3)
-             - decimal fraction (0xc4)
-             - bigfloat (0xc5)
-             - tagged items (0xc6..0xd4, 0xd8..0xdb)
-             - expected conversions (0xd5..0xd7)
-             - simple values (0xe0..0xf3, 0xf8)
-             - undefined (0xf7)
+             - byte strings (0x40..0x5F)
+             - date/time (0xC0..0xC1)
+             - bignum (0xC2..0xC3)
+             - decimal fraction (0xC4)
+             - bigfloat (0xC5)
+             - tagged items (0xC6..0xD4, 0xD8..0xDB)
+             - expected conversions (0xD5..0xD7)
+             - simple values (0xE0..0xF3, 0xF8)
+             - undefined (0xF7)
 
     @warning CBOR allows map keys of any type, whereas JSON only allows
              strings as keys in object values. Therefore, CBOR maps with keys
@@ -13404,6 +17773,9 @@ class basic_json
     @param[in] i  an input in CBOR format convertible to an input adapter
     @param[in] strict  whether to expect the input to be consumed until EOF
                        (true by default)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
+
     @return deserialized JSON value
 
     @throw parse_error.110 if the given input ends prematurely or the end of
@@ -13419,27 +17791,39 @@ class basic_json
 
     @sa http://cbor.io
     @sa @ref to_cbor(const basic_json&) for the analogous serialization
-    @sa @ref from_msgpack(detail::input_adapter, const bool) for the
+    @sa @ref from_msgpack(detail::input_adapter, const bool, const bool) for the
         related MessagePack format
+    @sa @ref from_ubjson(detail::input_adapter, const bool, const bool) for the
+        related UBJSON format
 
     @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
            consume input adapters, removed start_index parameter, and added
-           @a strict parameter since 3.0.0
+           @a strict parameter since 3.0.0; added @a allow_exceptions parameter
+           since 3.2.0
     */
-    static basic_json from_cbor(detail::input_adapter i,
-                                const bool strict = true)
+    static basic_json from_cbor(detail::input_adapter&& i,
+                                const bool strict = true,
+                                const bool allow_exceptions = true)
     {
-        return binary_reader(i).parse_cbor(strict);
+        basic_json result;
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        const bool res = binary_reader(detail::input_adapter(i)).sax_parse(input_format_t::cbor, &sdp, strict);
+        return res ? result : basic_json(value_t::discarded);
     }
 
     /*!
-    @copydoc from_cbor(detail::input_adapter, const bool)
+    @copydoc from_cbor(detail::input_adapter, const bool, const bool)
     */
     template<typename A1, typename A2,
              detail::enable_if_t<std::is_constructible<detail::input_adapter, A1, A2>::value, int> = 0>
-    static basic_json from_cbor(A1 && a1, A2 && a2, const bool strict = true)
+    static basic_json from_cbor(A1 && a1, A2 && a2,
+                                const bool strict = true,
+                                const bool allow_exceptions = true)
     {
-        return binary_reader(detail::input_adapter(std::forward<A1>(a1), std::forward<A2>(a2))).parse_cbor(strict);
+        basic_json result;
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        const bool res = binary_reader(detail::input_adapter(std::forward<A1>(a1), std::forward<A2>(a2))).sax_parse(input_format_t::cbor, &sdp, strict);
+        return res ? result : basic_json(value_t::discarded);
     }
 
     /*!
@@ -13452,38 +17836,38 @@ class basic_json
 
     MessagePack type | JSON value type | first byte
     ---------------- | --------------- | ----------
-    positive fixint  | number_unsigned | 0x00..0x7f
-    fixmap           | object          | 0x80..0x8f
-    fixarray         | array           | 0x90..0x9f
-    fixstr           | string          | 0xa0..0xbf
-    nil              | `null`          | 0xc0
-    false            | `false`         | 0xc2
-    true             | `true`          | 0xc3
-    float 32         | number_float    | 0xca
-    float 64         | number_float    | 0xcb
-    uint 8           | number_unsigned | 0xcc
-    uint 16          | number_unsigned | 0xcd
-    uint 32          | number_unsigned | 0xce
-    uint 64          | number_unsigned | 0xcf
-    int 8            | number_integer  | 0xd0
-    int 16           | number_integer  | 0xd1
-    int 32           | number_integer  | 0xd2
-    int 64           | number_integer  | 0xd3
-    str 8            | string          | 0xd9
-    str 16           | string          | 0xda
-    str 32           | string          | 0xdb
-    array 16         | array           | 0xdc
-    array 32         | array           | 0xdd
-    map 16           | object          | 0xde
-    map 32           | object          | 0xdf
-    negative fixint  | number_integer  | 0xe0-0xff
+    positive fixint  | number_unsigned | 0x00..0x7F
+    fixmap           | object          | 0x80..0x8F
+    fixarray         | array           | 0x90..0x9F
+    fixstr           | string          | 0xA0..0xBF
+    nil              | `null`          | 0xC0
+    false            | `false`         | 0xC2
+    true             | `true`          | 0xC3
+    float 32         | number_float    | 0xCA
+    float 64         | number_float    | 0xCB
+    uint 8           | number_unsigned | 0xCC
+    uint 16          | number_unsigned | 0xCD
+    uint 32          | number_unsigned | 0xCE
+    uint 64          | number_unsigned | 0xCF
+    int 8            | number_integer  | 0xD0
+    int 16           | number_integer  | 0xD1
+    int 32           | number_integer  | 0xD2
+    int 64           | number_integer  | 0xD3
+    str 8            | string          | 0xD9
+    str 16           | string          | 0xDA
+    str 32           | string          | 0xDB
+    array 16         | array           | 0xDC
+    array 32         | array           | 0xDD
+    map 16           | object          | 0xDE
+    map 32           | object          | 0xDF
+    negative fixint  | number_integer  | 0xE0..0xFF
 
     @warning The mapping is **incomplete** in the sense that not all
              MessagePack types can be converted to a JSON value. The following
              MessagePack types are not supported and will yield parse errors:
-              - bin 8 - bin 32 (0xc4..0xc6)
-              - ext 8 - ext 32 (0xc7..0xc9)
-              - fixext 1 - fixext 16 (0xd4..0xd8)
+              - bin 8 - bin 32 (0xC4..0xC6)
+              - ext 8 - ext 32 (0xC7..0xC9)
+              - fixext 1 - fixext 16 (0xD4..0xD8)
 
     @note Any MessagePack output created @ref to_msgpack can be successfully
           parsed by @ref from_msgpack.
@@ -13492,6 +17876,10 @@ class basic_json
                   adapter
     @param[in] strict  whether to expect the input to be consumed until EOF
                        (true by default)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
+
+    @return deserialized JSON value
 
     @throw parse_error.110 if the given input ends prematurely or the end of
     file was not reached when @a strict was set to true
@@ -13506,27 +17894,121 @@ class basic_json
 
     @sa http://msgpack.org
     @sa @ref to_msgpack(const basic_json&) for the analogous serialization
-    @sa @ref from_cbor(detail::input_adapter, const bool) for the related CBOR
-        format
+    @sa @ref from_cbor(detail::input_adapter, const bool, const bool) for the
+        related CBOR format
+    @sa @ref from_ubjson(detail::input_adapter, const bool, const bool) for
+        the related UBJSON format
 
     @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
            consume input adapters, removed start_index parameter, and added
-           @a strict parameter since 3.0.0
+           @a strict parameter since 3.0.0; added @a allow_exceptions parameter
+           since 3.2.0
     */
-    static basic_json from_msgpack(detail::input_adapter i,
-                                   const bool strict = true)
+    static basic_json from_msgpack(detail::input_adapter&& i,
+                                   const bool strict = true,
+                                   const bool allow_exceptions = true)
     {
-        return binary_reader(i).parse_msgpack(strict);
+        basic_json result;
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        const bool res = binary_reader(detail::input_adapter(i)).sax_parse(input_format_t::msgpack, &sdp, strict);
+        return res ? result : basic_json(value_t::discarded);
     }
 
     /*!
-    @copydoc from_msgpack(detail::input_adapter, const bool)
+    @copydoc from_msgpack(detail::input_adapter, const bool, const bool)
     */
     template<typename A1, typename A2,
              detail::enable_if_t<std::is_constructible<detail::input_adapter, A1, A2>::value, int> = 0>
-    static basic_json from_msgpack(A1 && a1, A2 && a2, const bool strict = true)
+    static basic_json from_msgpack(A1 && a1, A2 && a2,
+                                   const bool strict = true,
+                                   const bool allow_exceptions = true)
     {
-        return binary_reader(detail::input_adapter(std::forward<A1>(a1), std::forward<A2>(a2))).parse_msgpack(strict);
+        basic_json result;
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        const bool res = binary_reader(detail::input_adapter(std::forward<A1>(a1), std::forward<A2>(a2))).sax_parse(input_format_t::msgpack, &sdp, strict);
+        return res ? result : basic_json(value_t::discarded);
+    }
+
+    /*!
+    @brief create a JSON value from an input in UBJSON format
+
+    Deserializes a given input @a i to a JSON value using the UBJSON (Universal
+    Binary JSON) serialization format.
+
+    The library maps UBJSON types to JSON value types as follows:
+
+    UBJSON type | JSON value type                         | marker
+    ----------- | --------------------------------------- | ------
+    no-op       | *no value, next value is read*          | `N`
+    null        | `null`                                  | `Z`
+    false       | `false`                                 | `F`
+    true        | `true`                                  | `T`
+    float32     | number_float                            | `d`
+    float64     | number_float                            | `D`
+    uint8       | number_unsigned                         | `U`
+    int8        | number_integer                          | `i`
+    int16       | number_integer                          | `I`
+    int32       | number_integer                          | `l`
+    int64       | number_integer                          | `L`
+    string      | string                                  | `S`
+    char        | string                                  | `C`
+    array       | array (optimized values are supported)  | `[`
+    object      | object (optimized values are supported) | `{`
+
+    @note The mapping is **complete** in the sense that any UBJSON value can
+          be converted to a JSON value.
+
+    @param[in] i  an input in UBJSON format convertible to an input adapter
+    @param[in] strict  whether to expect the input to be consumed until EOF
+                       (true by default)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
+
+    @return deserialized JSON value
+
+    @throw parse_error.110 if the given input ends prematurely or the end of
+    file was not reached when @a strict was set to true
+    @throw parse_error.112 if a parse error occurs
+    @throw parse_error.113 if a string could not be parsed successfully
+
+    @complexity Linear in the size of the input @a i.
+
+    @liveexample{The example shows the deserialization of a byte vector in
+    UBJSON format to a JSON value.,from_ubjson}
+
+    @sa http://ubjson.org
+    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
+             analogous serialization
+    @sa @ref from_cbor(detail::input_adapter, const bool, const bool) for the
+        related CBOR format
+    @sa @ref from_msgpack(detail::input_adapter, const bool, const bool) for
+        the related MessagePack format
+
+    @since version 3.1.0; added @a allow_exceptions parameter since 3.2.0
+    */
+    static basic_json from_ubjson(detail::input_adapter&& i,
+                                  const bool strict = true,
+                                  const bool allow_exceptions = true)
+    {
+        basic_json result;
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        const bool res = binary_reader(detail::input_adapter(i)).sax_parse(input_format_t::ubjson, &sdp, strict);
+        return res ? result : basic_json(value_t::discarded);
+    }
+
+    /*!
+    @copydoc from_ubjson(detail::input_adapter, const bool, const bool)
+    */
+    template<typename A1, typename A2,
+             detail::enable_if_t<std::is_constructible<detail::input_adapter, A1, A2>::value, int> = 0>
+    static basic_json from_ubjson(A1 && a1, A2 && a2,
+                                  const bool strict = true,
+                                  const bool allow_exceptions = true)
+    {
+        basic_json result;
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        const bool res = binary_reader(detail::input_adapter(std::forward<A1>(a1), std::forward<A2>(a2))).sax_parse(input_format_t::ubjson, &sdp, strict);
+        return res ? result : basic_json(value_t::discarded);
     }
 
     /// @}
@@ -13627,6 +18109,9 @@ class basic_json
     pointer @a ptr. As `at` provides checked access (and no elements are
     implicitly inserted), the index '-' is always invalid. See example below.
 
+    @throw out_of_range.403 if the JSON pointer describes a key of an object
+    which cannot be found. See example below.
+
     @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved.
     See example below.
 
@@ -13667,6 +18152,9 @@ class basic_json
     pointer @a ptr. As `at` provides checked access (and no elements are
     implicitly inserted), the index '-' is always invalid. See example below.
 
+    @throw out_of_range.403 if the JSON pointer describes a key of an object
+    which cannot be found. See example below.
+
     @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved.
     See example below.
 
@@ -13882,7 +18370,7 @@ class basic_json
                         }
                         else
                         {
-                            const auto idx = std::stoi(last_path);
+                            const auto idx = json_pointer::array_index(last_path);
                             if (JSON_UNLIKELY(static_cast<size_type>(idx) > parent.size()))
                             {
                                 // avoid undefined behavior
@@ -13897,11 +18385,13 @@ class basic_json
                         break;
                     }
 
+                    // LCOV_EXCL_START
                     default:
                     {
                         // if there exists a parent it cannot be primitive
-                        assert(false);  // LCOV_EXCL_LINE
+                        assert(false);
                     }
+                        // LCOV_EXCL_STOP
                 }
             }
         };
@@ -13930,7 +18420,7 @@ class basic_json
             else if (parent.is_array())
             {
                 // note erase performs range check
-                parent.erase(static_cast<size_type>(std::stoi(last_path)));
+                parent.erase(static_cast<size_type>(json_pointer::array_index(last_path)));
             }
         };
 
@@ -13946,7 +18436,7 @@ class basic_json
             // wrapper to get a value for an operation
             const auto get_value = [&val](const std::string & op,
                                           const std::string & member,
-                                          bool string_type) -> basic_json&
+                                          bool string_type) -> basic_json &
             {
                 // find value
                 auto it = val.m_value.object->find(member);
@@ -14025,7 +18515,12 @@ class basic_json
                     const json_pointer from_ptr(from_path);
 
                     // the "from" location must exist - use at()
-                    result[ptr] = result.at(from_ptr);
+                    basic_json v = result.at(from_ptr);
+
+                    // The copy is functionally identical to an "add"
+                    // operation at the target location using the value
+                    // specified in the "from" member.
+                    operation_add(ptr, v);
                     break;
                 }
 
@@ -14038,7 +18533,7 @@ class basic_json
                         // the "path" location must exist - use at()
                         success = (result.at(ptr) == get_value("test", "value", false));
                     }
-                    JSON_CATCH (out_of_range&)
+                    JSON_INTERNAL_CATCH (out_of_range&)
                     {
                         // ignore out of range errors: success remains false
                     }
@@ -14091,6 +18586,7 @@ class basic_json
     diff for two JSON values.,diff}
 
     @sa @ref patch -- apply a JSON patch
+    @sa @ref merge_patch -- apply a JSON Merge Patch
 
     @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902)
 
@@ -14167,7 +18663,7 @@ class basic_json
                 case value_t::object:
                 {
                     // first pass: traverse this object's elements
-                    for (auto it = source.begin(); it != source.end(); ++it)
+                    for (auto it = source.cbegin(); it != source.cend(); ++it)
                     {
                         // escape the key name to be used in a JSON patch
                         const auto key = json_pointer::escape(it.key());
@@ -14189,7 +18685,7 @@ class basic_json
                     }
 
                     // second pass: traverse other object's elements
-                    for (auto it = target.begin(); it != target.end(); ++it)
+                    for (auto it = target.cbegin(); it != target.cend(); ++it)
                     {
                         if (source.find(it.key()) == source.end())
                         {
@@ -14222,418 +18718,86 @@ class basic_json
     }
 
     /// @}
+
+    ////////////////////////////////
+    // JSON Merge Patch functions //
+    ////////////////////////////////
+
+    /// @name JSON Merge Patch functions
+    /// @{
+
+    /*!
+    @brief applies a JSON Merge Patch
+
+    The merge patch format is primarily intended for use with the HTTP PATCH
+    method as a means of describing a set of modifications to a target
+    resource's content. This function applies a merge patch to the current
+    JSON value.
+
+    The function implements the following algorithm from Section 2 of
+    [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396):
+
+    ```
+    define MergePatch(Target, Patch):
+      if Patch is an Object:
+        if Target is not an Object:
+          Target = {} // Ignore the contents and set it to an empty Object
+        for each Name/Value pair in Patch:
+          if Value is null:
+            if Name exists in Target:
+              remove the Name/Value pair from Target
+          else:
+            Target[Name] = MergePatch(Target[Name], Value)
+        return Target
+      else:
+        return Patch
+    ```
+
+    Thereby, `Target` is the current object; that is, the patch is applied to
+    the current value.
+
+    @param[in] patch  the patch to apply
+
+    @complexity Linear in the lengths of @a patch.
+
+    @liveexample{The following code shows how a JSON Merge Patch is applied to
+    a JSON document.,merge_patch}
+
+    @sa @ref patch -- apply a JSON patch
+    @sa [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396)
+
+    @since version 3.0.0
+    */
+    void merge_patch(const basic_json& patch)
+    {
+        if (patch.is_object())
+        {
+            if (not is_object())
+            {
+                *this = object();
+            }
+            for (auto it = patch.begin(); it != patch.end(); ++it)
+            {
+                if (it.value().is_null())
+                {
+                    erase(it.key());
+                }
+                else
+                {
+                    operator[](it.key()).merge_patch(it.value());
+                }
+            }
+        }
+        else
+        {
+            *this = patch;
+        }
+    }
+
+    /// @}
 };
-
-/////////////
-// presets //
-/////////////
-
-/*!
-@brief default JSON class
-
-This type is the default specialization of the @ref basic_json class which
-uses the standard template types.
-
-@since version 1.0.0
-*/
-using json = basic_json<>;
-
-//////////////////
-// json_pointer //
-//////////////////
-
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-NLOHMANN_BASIC_JSON_TPL&
-json_pointer::get_and_create(NLOHMANN_BASIC_JSON_TPL& j) const
-{
-    using size_type = typename NLOHMANN_BASIC_JSON_TPL::size_type;
-    auto result = &j;
-
-    // in case no reference tokens exist, return a reference to the JSON value
-    // j which will be overwritten by a primitive value
-    for (const auto& reference_token : reference_tokens)
-    {
-        switch (result->m_type)
-        {
-            case detail::value_t::null:
-            {
-                if (reference_token == "0")
-                {
-                    // start a new array if reference token is 0
-                    result = &result->operator[](0);
-                }
-                else
-                {
-                    // start a new object otherwise
-                    result = &result->operator[](reference_token);
-                }
-                break;
-            }
-
-            case detail::value_t::object:
-            {
-                // create an entry in the object
-                result = &result->operator[](reference_token);
-                break;
-            }
-
-            case detail::value_t::array:
-            {
-                // create an entry in the array
-                JSON_TRY
-                {
-                    result = &result->operator[](static_cast<size_type>(std::stoi(reference_token)));
-                }
-                JSON_CATCH(std::invalid_argument&)
-                {
-                    JSON_THROW(detail::parse_error::create(109, 0, "array index '" + reference_token + "' is not a number"));
-                }
-                break;
-            }
-
-            /*
-            The following code is only reached if there exists a reference
-            token _and_ the current value is primitive. In this case, we have
-            an error situation, because primitive values may only occur as
-            single value; that is, with an empty list of reference tokens.
-            */
-            default:
-                JSON_THROW(detail::type_error::create(313, "invalid value to unflatten"));
-        }
-    }
-
-    return *result;
-}
-
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-NLOHMANN_BASIC_JSON_TPL&
-json_pointer::get_unchecked(NLOHMANN_BASIC_JSON_TPL* ptr) const
-{
-    using size_type = typename NLOHMANN_BASIC_JSON_TPL::size_type;
-    for (const auto& reference_token : reference_tokens)
-    {
-        // convert null values to arrays or objects before continuing
-        if (ptr->m_type == detail::value_t::null)
-        {
-            // check if reference token is a number
-            const bool nums =
-                std::all_of(reference_token.begin(), reference_token.end(),
-                            [](const char x)
-            {
-                return (x >= '0' and x <= '9');
-            });
-
-            // change value to array for numbers or "-" or to object otherwise
-            *ptr = (nums or reference_token == "-")
-                   ? detail::value_t::array
-                   : detail::value_t::object;
-        }
-
-        switch (ptr->m_type)
-        {
-            case detail::value_t::object:
-            {
-                // use unchecked object access
-                ptr = &ptr->operator[](reference_token);
-                break;
-            }
-
-            case detail::value_t::array:
-            {
-                // error condition (cf. RFC 6901, Sect. 4)
-                if (JSON_UNLIKELY(reference_token.size() > 1 and reference_token[0] == '0'))
-                {
-                    JSON_THROW(detail::parse_error::create(106, 0,
-                                                           "array index '" + reference_token +
-                                                           "' must not begin with '0'"));
-                }
-
-                if (reference_token == "-")
-                {
-                    // explicitly treat "-" as index beyond the end
-                    ptr = &ptr->operator[](ptr->m_value.array->size());
-                }
-                else
-                {
-                    // convert array index to number; unchecked access
-                    JSON_TRY
-                    {
-                        ptr = &ptr->operator[](
-                            static_cast<size_type>(std::stoi(reference_token)));
-                    }
-                    JSON_CATCH(std::invalid_argument&)
-                    {
-                        JSON_THROW(detail::parse_error::create(109, 0, "array index '" + reference_token + "' is not a number"));
-                    }
-                }
-                break;
-            }
-
-            default:
-                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'"));
-        }
-    }
-
-    return *ptr;
-}
-
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-NLOHMANN_BASIC_JSON_TPL&
-json_pointer::get_checked(NLOHMANN_BASIC_JSON_TPL* ptr) const
-{
-    using size_type = typename NLOHMANN_BASIC_JSON_TPL::size_type;
-    for (const auto& reference_token : reference_tokens)
-    {
-        switch (ptr->m_type)
-        {
-            case detail::value_t::object:
-            {
-                // note: at performs range check
-                ptr = &ptr->at(reference_token);
-                break;
-            }
-
-            case detail::value_t::array:
-            {
-                if (JSON_UNLIKELY(reference_token == "-"))
-                {
-                    // "-" always fails the range check
-                    JSON_THROW(detail::out_of_range::create(402,
-                                                            "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
-                                                            ") is out of range"));
-                }
-
-                // error condition (cf. RFC 6901, Sect. 4)
-                if (JSON_UNLIKELY(reference_token.size() > 1 and reference_token[0] == '0'))
-                {
-                    JSON_THROW(detail::parse_error::create(106, 0,
-                                                           "array index '" + reference_token +
-                                                           "' must not begin with '0'"));
-                }
-
-                // note: at performs range check
-                JSON_TRY
-                {
-                    ptr = &ptr->at(static_cast<size_type>(std::stoi(reference_token)));
-                }
-                JSON_CATCH(std::invalid_argument&)
-                {
-                    JSON_THROW(detail::parse_error::create(109, 0, "array index '" + reference_token + "' is not a number"));
-                }
-                break;
-            }
-
-            default:
-                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'"));
-        }
-    }
-
-    return *ptr;
-}
-
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-const NLOHMANN_BASIC_JSON_TPL&
-json_pointer::get_unchecked(const NLOHMANN_BASIC_JSON_TPL* ptr) const
-{
-    using size_type = typename NLOHMANN_BASIC_JSON_TPL::size_type;
-    for (const auto& reference_token : reference_tokens)
-    {
-        switch (ptr->m_type)
-        {
-            case detail::value_t::object:
-            {
-                // use unchecked object access
-                ptr = &ptr->operator[](reference_token);
-                break;
-            }
-
-            case detail::value_t::array:
-            {
-                if (JSON_UNLIKELY(reference_token == "-"))
-                {
-                    // "-" cannot be used for const access
-                    JSON_THROW(detail::out_of_range::create(402,
-                                                            "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
-                                                            ") is out of range"));
-                }
-
-                // error condition (cf. RFC 6901, Sect. 4)
-                if (JSON_UNLIKELY(reference_token.size() > 1 and reference_token[0] == '0'))
-                {
-                    JSON_THROW(detail::parse_error::create(106, 0,
-                                                           "array index '" + reference_token +
-                                                           "' must not begin with '0'"));
-                }
-
-                // use unchecked array access
-                JSON_TRY
-                {
-                    ptr = &ptr->operator[](
-                        static_cast<size_type>(std::stoi(reference_token)));
-                }
-                JSON_CATCH(std::invalid_argument&)
-                {
-                    JSON_THROW(detail::parse_error::create(109, 0, "array index '" + reference_token + "' is not a number"));
-                }
-                break;
-            }
-
-            default:
-                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'"));
-        }
-    }
-
-    return *ptr;
-}
-
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-const NLOHMANN_BASIC_JSON_TPL&
-json_pointer::get_checked(const NLOHMANN_BASIC_JSON_TPL* ptr) const
-{
-    using size_type = typename NLOHMANN_BASIC_JSON_TPL::size_type;
-    for (const auto& reference_token : reference_tokens)
-    {
-        switch (ptr->m_type)
-        {
-            case detail::value_t::object:
-            {
-                // note: at performs range check
-                ptr = &ptr->at(reference_token);
-                break;
-            }
-
-            case detail::value_t::array:
-            {
-                if (JSON_UNLIKELY(reference_token == "-"))
-                {
-                    // "-" always fails the range check
-                    JSON_THROW(detail::out_of_range::create(402,
-                                                            "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
-                                                            ") is out of range"));
-                }
-
-                // error condition (cf. RFC 6901, Sect. 4)
-                if (JSON_UNLIKELY(reference_token.size() > 1 and reference_token[0] == '0'))
-                {
-                    JSON_THROW(detail::parse_error::create(106, 0,
-                                                           "array index '" + reference_token +
-                                                           "' must not begin with '0'"));
-                }
-
-                // note: at performs range check
-                JSON_TRY
-                {
-                    ptr = &ptr->at(static_cast<size_type>(std::stoi(reference_token)));
-                }
-                JSON_CATCH(std::invalid_argument&)
-                {
-                    JSON_THROW(detail::parse_error::create(109, 0, "array index '" + reference_token + "' is not a number"));
-                }
-                break;
-            }
-
-            default:
-                JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'"));
-        }
-    }
-
-    return *ptr;
-}
-
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-void json_pointer::flatten(const std::string& reference_string,
-                           const NLOHMANN_BASIC_JSON_TPL& value,
-                           NLOHMANN_BASIC_JSON_TPL& result)
-{
-    switch (value.m_type)
-    {
-        case detail::value_t::array:
-        {
-            if (value.m_value.array->empty())
-            {
-                // flatten empty array as null
-                result[reference_string] = nullptr;
-            }
-            else
-            {
-                // iterate array and use index as reference string
-                for (std::size_t i = 0; i < value.m_value.array->size(); ++i)
-                {
-                    flatten(reference_string + "/" + std::to_string(i),
-                            value.m_value.array->operator[](i), result);
-                }
-            }
-            break;
-        }
-
-        case detail::value_t::object:
-        {
-            if (value.m_value.object->empty())
-            {
-                // flatten empty object as null
-                result[reference_string] = nullptr;
-            }
-            else
-            {
-                // iterate object and use keys as reference string
-                for (const auto& element : *value.m_value.object)
-                {
-                    flatten(reference_string + "/" + escape(element.first), element.second, result);
-                }
-            }
-            break;
-        }
-
-        default:
-        {
-            // add primitive value with its reference string
-            result[reference_string] = value;
-            break;
-        }
-    }
-}
-
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-NLOHMANN_BASIC_JSON_TPL
-json_pointer::unflatten(const NLOHMANN_BASIC_JSON_TPL& value)
-{
-    if (JSON_UNLIKELY(not value.is_object()))
-    {
-        JSON_THROW(detail::type_error::create(314, "only objects can be unflattened"));
-    }
-
-    NLOHMANN_BASIC_JSON_TPL result;
-
-    // iterate the JSON object values
-    for (const auto& element : *value.m_value.object)
-    {
-        if (JSON_UNLIKELY(not element.second.is_primitive()))
-        {
-            JSON_THROW(detail::type_error::create(315, "values in object must be primitive"));
-        }
-
-        // assign value to reference pointed to by JSON pointer; Note that if
-        // the JSON pointer is "" (i.e., points to the whole value), function
-        // get_and_create returns a reference to result itself. An assignment
-        // will then create a primitive value.
-        json_pointer(element.first).get_and_create(result) = element.second;
-    }
-
-    return result;
-}
-
-inline bool operator==(json_pointer const& lhs, json_pointer const& rhs) noexcept
-{
-    return (lhs.reference_tokens == rhs.reference_tokens);
-}
-
-inline bool operator!=(json_pointer const& lhs, json_pointer const& rhs) noexcept
-{
-    return not (lhs == rhs);
-}
 } // namespace nlohmann
 
-
 ///////////////////////
 // nonmember support //
 ///////////////////////
@@ -14641,20 +18805,6 @@ inline bool operator!=(json_pointer const& lhs, json_pointer const& rhs) noexcep
 // specialization of std::swap, and std::hash
 namespace std
 {
-/*!
-@brief exchanges the values of two JSON objects
-
-@since version 1.0.0
-*/
-template<>
-inline void swap(nlohmann::json& j1,
-                 nlohmann::json& j2) noexcept(
-                     is_nothrow_move_constructible<nlohmann::json>::value and
-                     is_nothrow_move_assignable<nlohmann::json>::value
-                 )
-{
-    j1.swap(j2);
-}
 
 /// hash value for JSON objects
 template<>
@@ -14690,6 +18840,20 @@ struct less< ::nlohmann::detail::value_t>
     }
 };
 
+/*!
+@brief exchanges the values of two JSON objects by delegating to j1.swap(j2)
+@note noexcept only if nlohmann::json is nothrow move constructible and move assignable
+@since version 1.0.0
+*/
+template<>
+inline void swap<nlohmann::json>(nlohmann::json& j1, nlohmann::json& j2) noexcept(
+    is_nothrow_move_constructible<nlohmann::json>::value and
+    is_nothrow_move_assignable<nlohmann::json>::value
+)
+{
+    j1.swap(j2);
+}
+
 } // namespace std
 
 /*!
@@ -14728,6 +18892,9 @@ inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std
     return nlohmann::json::json_pointer(std::string(s, n));
 }
 
+// #include <nlohmann/detail/macro_unscope.hpp>
+
+
 // restore GCC/clang diagnostic settings
 #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
     #pragma GCC diagnostic pop
@@ -14737,13 +18904,17 @@ inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std
 #endif
 
 // clean up
+#undef JSON_INTERNAL_CATCH
 #undef JSON_CATCH
 #undef JSON_THROW
 #undef JSON_TRY
 #undef JSON_LIKELY
 #undef JSON_UNLIKELY
 #undef JSON_DEPRECATED
+#undef JSON_HAS_CPP_14
+#undef JSON_HAS_CPP_17
 #undef NLOHMANN_BASIC_JSON_TPL_DECLARATION
 #undef NLOHMANN_BASIC_JSON_TPL
 
+
 #endif
diff --git a/lib/lattice/Lattice.h b/Grid/lattice/Lattice.h
similarity index 100%
rename from lib/lattice/Lattice.h
rename to Grid/lattice/Lattice.h
diff --git a/lib/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h
similarity index 100%
rename from lib/lattice/Lattice_ET.h
rename to Grid/lattice/Lattice_ET.h
diff --git a/lib/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h
similarity index 96%
rename from lib/lattice/Lattice_arith.h
rename to Grid/lattice/Lattice_arith.h
index 117e7e6d..203c3826 100644
--- a/lib/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -300,20 +300,14 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
 }
 
 template<class sobj,class vobj> inline
-RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-  ret.Checkerboard() = x.Checkerboard();
-  conformable(ret,x);
-  conformable(x,y);
-  axpy(ret,a,x,y);
-  return norm2(ret);
+RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+    return axpy_norm_fast(ret,a,x,y); // thin wrapper: all arguments are forwarded unchanged
 }
 template<class sobj,class vobj> inline
-RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-  ret.Checkerboard() = x.Checkerboard();
-  conformable(ret,x);
-  conformable(x,y);
-  axpby(ret,a,b,x,y);
-  return norm2(ret); // FIXME implement parallel norm in ss loop
+RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+    return axpby_norm_fast(ret,a,b,x,y); // thin wrapper: all arguments are forwarded unchanged
 }
 
 NAMESPACE_END(Grid);
diff --git a/lib/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
similarity index 98%
rename from lib/lattice/Lattice_base.h
rename to Grid/lattice/Lattice_base.h
index cfa556ff..b3b85791 100644
--- a/lib/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -362,6 +362,16 @@ public:
     assert((((uint64_t)&this->_odata[0])&0xF) ==0);
     this->checkerboard=0;
   }
+  
+  //  virtual ~Lattice(void) = default;
+    
+  void reset(GridBase* grid) { // re-target this lattice onto `grid`
+    if (this->_grid != grid) { // nothing is touched when `grid` is already the current grid
+      this->_grid = grid;
+      this->_odata.resize(grid->oSites()); // storage is resized to the new grid's oSites()
+      this->checkerboard = 0;              // checkerboard is set to 0 whenever the grid changes
+    }
+  }
   ///////////////////////////////////////////
   // copy constructor
   ///////////////////////////////////////////
@@ -396,6 +406,7 @@ public:
     });
     return *this;
   }
+
   ///////////////////////////////////////////
   // Copy assignment 
   ///////////////////////////////////////////
@@ -451,7 +462,6 @@ public:
     tmp = *lp;    *lp=*rp;    *rp=tmp;
   }
 
-
 }; // class Lattice
 
 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
diff --git a/lib/lattice/Lattice_comparison.h b/Grid/lattice/Lattice_comparison.h
similarity index 100%
rename from lib/lattice/Lattice_comparison.h
rename to Grid/lattice/Lattice_comparison.h
diff --git a/lib/lattice/Lattice_comparison_utils.h b/Grid/lattice/Lattice_comparison_utils.h
similarity index 97%
rename from lib/lattice/Lattice_comparison_utils.h
rename to Grid/lattice/Lattice_comparison_utils.h
index 408961ec..7b1bc19f 100644
--- a/lib/lattice/Lattice_comparison_utils.h
+++ b/Grid/lattice/Lattice_comparison_utils.h
@@ -179,7 +179,7 @@ accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar
   return ret;
 }
 
-#define DECLARE_RELATIONAL(op,functor)					\
+#define DECLARE_RELATIONAL_EQ(op,functor) \
   template<class vsimd,IfSimd<vsimd> = 0>				\
   accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)	\
   {									\
@@ -212,14 +212,15 @@ accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar
   accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
   {									\
     return lhs op rhs._internal;					\
-  }									
+    }									\
 
+#define DECLARE_RELATIONAL(op,functor) DECLARE_RELATIONAL_EQ(op,functor)    
 
 DECLARE_RELATIONAL(<,slt);
 DECLARE_RELATIONAL(<=,sle);
 DECLARE_RELATIONAL(>,sgt);
 DECLARE_RELATIONAL(>=,sge);
-DECLARE_RELATIONAL(==,seq);
+DECLARE_RELATIONAL_EQ(==,seq);
 DECLARE_RELATIONAL(!=,sne);
 
 #undef DECLARE_RELATIONAL
diff --git a/lib/lattice/Lattice_conformable.h b/Grid/lattice/Lattice_conformable.h
similarity index 100%
rename from lib/lattice/Lattice_conformable.h
rename to Grid/lattice/Lattice_conformable.h
diff --git a/lib/lattice/Lattice_coordinate.h b/Grid/lattice/Lattice_coordinate.h
similarity index 96%
rename from lib/lattice/Lattice_coordinate.h
rename to Grid/lattice/Lattice_coordinate.h
index c83e91a2..16f3641b 100644
--- a/lib/lattice/Lattice_coordinate.h
+++ b/Grid/lattice/Lattice_coordinate.h
@@ -25,8 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_LATTICE_COORDINATE_H
-#define GRID_LATTICE_COORDINATE_H
+#pragma once 
 
 NAMESPACE_BEGIN(Grid);
 
@@ -72,4 +71,4 @@ template<class vobj> void lex_sites(Lattice<vobj> &l){
 }
 
 NAMESPACE_END(Grid);
-#endif
+
diff --git a/lib/lattice/Lattice_local.h b/Grid/lattice/Lattice_local.h
similarity index 100%
rename from lib/lattice/Lattice_local.h
rename to Grid/lattice/Lattice_local.h
diff --git a/lib/lattice/Lattice_matrix_reduction.h b/Grid/lattice/Lattice_matrix_reduction.h
similarity index 100%
rename from lib/lattice/Lattice_matrix_reduction.h
rename to Grid/lattice/Lattice_matrix_reduction.h
diff --git a/Grid/lattice/Lattice_overload.h b/Grid/lattice/Lattice_overload.h
new file mode 100644
index 00000000..0906b610
--- /dev/null
+++ b/Grid/lattice/Lattice_overload.h
@@ -0,0 +1,138 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_overload.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_OVERLOAD_H
+#define GRID_LATTICE_OVERLOAD_H
+
+namespace Grid {
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // unary negation: builds a new lattice on r's grid holding -r at every outer site
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class vobj>
+  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
+  {
+    Lattice<vobj> ret(r._grid);
+    parallel_for(int ss=0;ss<r._grid->oSites();ss++){
+      vstream(ret._odata[ss], -r._odata[ss]);
+    }
+    return ret;
+  }
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Lattice BinOp Lattice: the result is allocated on rhs's grid and filled by mult/add/sub.
+  // NB mult performs conformable check. Do not reapply here for performance.
+  /////////////////////////////////////////////////////////////////////////////////////
+  template<class left,class right>
+  inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
+  {
+    Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
+    mult(ret,lhs,rhs);
+    return ret;
+  }
+  template<class left,class right>
+  inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
+  {
+    Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
+    add(ret,lhs,rhs);
+    return ret;
+  }
+  template<class left,class right>
+  inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
+  {
+    Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
+    sub(ret,lhs,rhs);
+    return ret;
+  }
+
+  // Scalar BinOp Lattice: the return type is generated from the site-wise expression
+  template<class left,class right>
+  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
+  {
+    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
+      vstream(ret._odata[ss],tmp);
+      // i.e. ret._odata[ss]=lhs*rhs._odata[ss], written through vstream
+    }
+    return ret;
+  }
+  template<class left,class right>
+  inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
+  {
+    Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      decltype(lhs+rhs._odata[0]) tmp=lhs+rhs._odata[ss];
+      vstream(ret._odata[ss],tmp);
+      // a sum, not a difference: must agree with the decltype(lhs+rhs._odata[0]) return type
+    }
+    return ret;
+  }
+  template<class left,class right>
+  inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
+  {
+    Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
+      vstream(ret._odata[ss],tmp);
+    }
+    return ret;
+  }
+  template<class left,class right>
+  inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
+  {
+    Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
+    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+      decltype(lhs._odata[0]*rhs) tmp=lhs._odata[ss]*rhs;
+      vstream(ret._odata[ss],tmp);
+      // Lattice BinOp Scalar: the result lives on lhs's grid
+    }
+    return ret;
+  }
+  template<class left,class right>
+  inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
+  {
+    Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
+    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+      decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
+      vstream(ret._odata[ss],tmp);
+      // loop bound comes from lhs, the Lattice operand whose grid ret was built on
+    }
+    return ret;
+  }
+  template<class left,class right>
+  inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
+  {
+    Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
+    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+      decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
+      vstream(ret._odata[ss],tmp);
+      // loop bound comes from lhs, the Lattice operand whose grid ret was built on
+    }
+    return ret;
+  }
+}
+#endif
diff --git a/lib/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h
similarity index 100%
rename from lib/lattice/Lattice_peekpoke.h
rename to Grid/lattice/Lattice_peekpoke.h
diff --git a/lib/lattice/Lattice_reality.h b/Grid/lattice/Lattice_reality.h
similarity index 100%
rename from lib/lattice/Lattice_reality.h
rename to Grid/lattice/Lattice_reality.h
diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
new file mode 100644
index 00000000..be2cbf44
--- /dev/null
+++ b/Grid/lattice/Lattice_reduction.h
@@ -0,0 +1,738 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+    Source file: ./lib/lattice/Lattice_reduction.h
+    Copyright (C) 2015
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+#include <Grid/Grid_Eigen_Dense.h>
+
+NAMESPACE_BEGIN(Grid);
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Deterministic Reduction operations
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
+  // squared norm of the field: real part of its inner product with itself
+  return real(ComplexD(innerProduct(arg,arg)));
+}
+
+// Inner product of two lattices, accumulated and returned in double precision
+template<class vobj>
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+{
+  // Partial sums use the double-precision SIMD type whatever the precision of vobj.
+  typedef typename vobj::vector_typeD vector_type;
+  ComplexD  nrm; // double even for single-precision vobj, so the double partial sums are not truncated
+  
+  GridBase *grid = left.Grid();
+  
+  Vector<vector_type> sumarray(grid->SumArraySize()); // one partial sum per thread_loop slot
+
+  auto left_v = left.View();
+  auto right_v=right.View();
+
+  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
+    int mywork, myoff;
+    GridThread::GetWork(grid->oSites(),thr,mywork,myoff); // sites myoff .. myoff+mywork-1 belong to this slot
+    
+    decltype(innerProductD(left_v[0],right_v[0])) vnrm=Zero(); // private to thread; sub summation
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vnrm = vnrm + innerProductD(left_v[ss],right_v[ss]);
+    }
+    sumarray[thr]=TensorRemove(vnrm) ;
+  });
+  
+  vector_type vvnrm; vvnrm=Zero();  // sum across threads
+  for(int i=0;i<grid->SumArraySize();i++){
+    vvnrm = vvnrm+sumarray[i];
+  } 
+  nrm = Reduce(vvnrm);// sum across simd
+  right.Grid()->GlobalSum(nrm); // sum across nodes
+  return nrm;
+}
+
+/////////////////////////
+// Fast axpby_norm
+// z = a x + b y
+// return norm z
+/////////////////////////
+template<class sobj,class vobj> strong_inline RealD 
+axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) 
+{
+  // z = a*x + y is the b == 1 case of axpby_norm_fast
+  return axpby_norm_fast(z,a,sobj(1.0),x,y);
+}
+
+template<class sobj,class vobj> strong_inline RealD 
+axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) 
+{
+  const int pad = 8; // partial sums sit pad RealDs apart in sumarray (presumably to avoid cache-line sharing)
+  z.Checkerboard() = x.Checkerboard();
+  conformable(z,x);
+  conformable(x,y);
+
+  // z = a*x + b*y is written and its squared norm accumulated in the same sweep over the sites;
+  // the per-site norm is taken with innerProductD and the partial sums are held as RealD.
+  RealD  nrm;
+  
+  GridBase *grid = x.Grid();
+  
+  Vector<RealD> sumarray(grid->SumArraySize()*pad);
+  
+  auto x_v=x.View();
+  auto y_v=y.View();
+  auto z_v=z.View();
+  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),
+  {
+    int mywork, myoff;
+    GridThread::GetWork(grid->oSites(),thr,mywork,myoff); // sites myoff .. myoff+mywork-1 belong to this slot
+    
+    // private to thread; sub summation
+    decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero(); 
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vobj tmp = a*x_v[ss]+b*y_v[ss];
+      vnrm = vnrm + innerProductD(tmp,tmp);
+      vstream(z_v[ss],tmp);
+    }
+    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
+  });
+  
+  nrm = 0.0; // sum across threads; linear in thread count but fast
+  for(int i=0;i<grid->SumArraySize();i++){
+    nrm = nrm+sumarray[i*pad];
+  } 
+  z.Grid()->GlobalSum(nrm); // sum across nodes
+  return nrm; 
+}
+
+ 
+template<class Op,class T1>
+inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
+  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
+{
+  return sum(closure(expr)); // forwards to sum() on closure(expr); presumably closure materialises the expression - confirm
+}
+
+template<class Op,class T1,class T2>
+inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
+      ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
+{
+  return sum(closure(expr)); // forwards to sum() on closure(expr); presumably closure materialises the expression - confirm
+}
+
+
+template<class Op,class T1,class T2,class T3>
+inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+  ->typename decltype(expr.op.func(eval(0,expr.arg1),
+				      eval(0,expr.arg2),
+				      eval(0,expr.arg3)
+				      ))::scalar_object
+{
+  return sum(closure(expr)); // forwards to sum() on closure(expr); presumably closure materialises the expression - confirm
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  GridBase *grid=arg.Grid();
+  int Nsimd = grid->Nsimd();
+  // Reduction order: per-slot partial sums, then across slots, then across SIMD lanes, then across nodes.
+  Vector<vobj> sumarray(grid->SumArraySize());
+  for(int i=0;i<grid->SumArraySize();i++){
+    sumarray[i]=Zero();
+  }
+  
+  auto arg_v=arg.View();
+  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
+    int mywork, myoff;
+    GridThread::GetWork(grid->oSites(),thr,mywork,myoff); // sites myoff .. myoff+mywork-1 belong to this slot
+    
+    vobj vvsum=Zero(); // private to this slot
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vvsum = vvsum + arg_v[ss];
+    }
+    sumarray[thr]=vvsum;
+  });
+  
+  vobj vsum=Zero();  // sum across threads
+  for(int i=0;i<grid->SumArraySize();i++){
+    vsum = vsum+sumarray[i];
+  } 
+  
+  typedef typename vobj::scalar_object sobj;
+  sobj ssum=Zero();
+  
+  ExtractBuffer<sobj>               buf(Nsimd);
+  extract(vsum,buf); // split the SIMD object into Nsimd scalar objects
+  
+  for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i]; // sum across simd lanes
+  grid->GlobalSum(ssum); // sum across nodes
+  
+  return ssum;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
+{
+  ///////////////////////////////////////////////////////
+  // Sums Data over each plane orthogonal to orthogdim; result is resized to fd and result[t]
+  // holds the globally summed slice t. FIXME precision promoted summation may be important
+  // for correlation functions, but is easily avoided by using double precision fields.
+  ///////////////////////////////////////////////////////
+  typedef typename vobj::scalar_object sobj;
+  GridBase  *grid = Data.Grid();
+  assert(grid!=NULL);
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  int fd=grid->_fdimensions[orthogdim]; // presumably the full (global) extent along orthogdim - confirm in GridBase
+  int ld=grid->_ldimensions[orthogdim]; // presumably the node-local extent - confirm in GridBase
+  int rd=grid->_rdimensions[orthogdim]; // presumably the SIMD-reduced local extent - confirm in GridBase
+
+  Vector<vobj> lvSum(rd); // will locally sum vectors first
+  Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
+  ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD
+
+  result.resize(fd); // And then global sum to return the same vector to every node 
+  for(int r=0;r<rd;r++){
+    lvSum[r]=Zero();
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+
+  // sum over reduced dimension planes, breaking out orthog dir
+  // Parallel over orthog direction
+  auto Data_v=Data.View();
+  thread_loop( (int r=0;r<rd;r++), {
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b;
+	lvSum[r]=lvSum[r]+Data_v[ss]; // each iteration r writes only its own lvSum[r]
+      }
+    }
+  });
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  Coordinate icoor(Nd);
+
+  for(int rt=0;rt<rd;rt++){
+
+    extract(lvSum[rt],extracted);
+
+    for(int idx=0;idx<Nsimd;idx++){
+
+      grid->iCoorFromIindex(icoor,idx);
+
+      int ldx =rt+icoor[orthogdim]*rd; // local slice index: rt offset by this lane's coordinate along orthogdim times rd
+
+      lsSum[ldx]=lsSum[ldx]+extracted[idx];
+
+    }
+  }
+  
+  // sum over nodes.
+  sobj gsum;
+  for(int t=0;t<fd;t++){
+    int pt = t/ld; // processor plane
+    int lt = t%ld; // index of slice t inside that processor plane
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      gsum=lsSum[lt];
+    } else {
+      gsum=Zero(); // nodes outside processor plane pt contribute zero for slice t
+    }
+
+    grid->GlobalSum(gsum); // one global reduction per slice t
+
+    result[t]=gsum;
+  }
+}
+
+template<class vobj>
+static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
+{
+  // std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
+  // Two stages sharing lsSum: the node-local per-slice sums, then the sum over nodes into result.
+  typedef typename vobj::scalar_type scalar_type;
+  std::vector<scalar_type> lsSum;
+  localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim); // fills lsSum and resizes result
+  globalSliceInnerProductVector(result, lhs, lsSum, orthogdim); // overwrites result with the global sums
+  // std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
+}
+
+template <class vobj>
+static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
+{
+  // Node-local stage: accumulates the per-slice inner products of lhs and rhs along orthogdim into lsSum.
+  typedef typename vobj::vector_type   vector_type;
+  typedef typename vobj::scalar_type   scalar_type;
+  GridBase  *grid = lhs.Grid();
+  assert(grid!=NULL);
+  conformable(grid,rhs.Grid());
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  int fd=grid->_fdimensions[orthogdim];
+  int ld=grid->_ldimensions[orthogdim];
+  int rd=grid->_rdimensions[orthogdim];
+  // std::cout << GridLogMessage << "Start alloc" << std::endl;
+
+  Vector<vector_type> lvSum(rd); // will locally sum vectors first
+  lsSum.resize(ld,scalar_type(0.0));                    // sum across these down to scalars
+  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD  
+  // std::cout << GridLogMessage << "End alloc" << std::endl;
+
+  result.resize(fd); // only sized here; the values are produced by the global stage
+  for(int r=0;r<rd;r++){
+    lvSum[r]=Zero();
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+  // std::cout << GridLogMessage << "End prep" << std::endl;
+  // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
+  // The per-site temporary vv is declared inside the loop body so no two iterations share it.
+  auto l_v=lhs.View();
+  auto r_v=rhs.View();
+  thread_loop( (int r=0;r<rd;r++),{
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int ss = so + n * stride + b;
+        vector_type vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
+        lvSum[r] = lvSum[r] + vv;
+      }
+    }
+  });
+  // std::cout << GridLogMessage << "End parallel inner product" << std::endl;
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  Coordinate icoor(Nd);
+  for(int rt=0;rt<rd;rt++){
+
+    iScalar<vector_type> temp; 
+    temp._internal = lvSum[rt];
+    extract(temp,extracted);
+
+    for(int idx=0;idx<Nsimd;idx++){
+
+      grid->iCoorFromIindex(icoor,idx);
+
+      int ldx =rt+icoor[orthogdim]*rd;
+
+      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
+
+    }
+  }
+  // std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
+}
+template <class vobj>
+static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
+{
+  // Combines per-node slice sums into one global vector. Each node copies its
+  // lsSum entries to its own offsets of a zero-filled buffer of result.size()
+  // entries; one GlobalSumVector call then adds the buffers of all nodes.
+  // result must already have its final length and lsSum must be non-empty.
+  typedef typename vobj::scalar_type scalar_type;
+  GridBase *grid = lhs.Grid();
+  int fd = result.size();
+  int ld = lsSum.size();
+  assert(fd == 0 || ld > 0); // ld is the divisor in the index arithmetic below
+  std::vector<scalar_type> gsum(fd, scalar_type(0.0));
+  for(int t=0;t<fd;t++){
+    int pt = t/ld; // processor plane
+    int lt = t%ld;
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      gsum[t]=lsSum[lt];
+    }
+  }
+  grid->GlobalSumVector(gsum.data(), fd);
+  // Copy element by element: result holds ComplexD while gsum holds scalar_type,
+  // and std::vector assignment only exists between identical vector types.
+  for(int t=0;t<fd;t++) result[t]=gsum[t];
+}
+template<class vobj>
+static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
+{
+  // Inner product of lhs with rhs taken separately on every slice orthogonal to
+  // `orthogdim`. result is resized to fd entries and result[t] is filled with
+  // the value for slice t after a GlobalSum over nodes.
+  typedef typename vobj::vector_type   vector_type;
+  typedef typename vobj::scalar_type   scalar_type;
+
+  GridBase *grid = lhs.Grid();
+  assert(grid!=NULL);
+  conformable(grid,rhs.Grid());
+
+  const int Nd    = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  // fd sizes the returned vector, ld the scalar sums and rd the SIMD-packed sums.
+  const int fd = grid->_fdimensions[orthogdim];
+  const int ld = grid->_ldimensions[orthogdim];
+  const int rd = grid->_rdimensions[orthogdim];
+
+  Vector<vector_type> packedSum(rd);                     // one SIMD accumulator per index r
+  Vector<scalar_type > sliceSum(ld,scalar_type(0.0));    // one scalar sum per local slice
+  ExtractBuffer<iScalar<scalar_type> > lanes(Nsimd);     // scratch for splitting the SIMD lanes
+
+  result.resize(fd);
+  for(int r=0;r<rd;r++){
+    packedSum[r]=Zero();
+  }
+
+  const int nblock  = grid->_slice_nblock[orthogdim];
+  const int block   = grid->_slice_block [orthogdim];
+  const int sstride = grid->_slice_stride[orthogdim];
+
+  auto left_v =lhs.View();
+  auto right_v=rhs.View();
+  // Iteration r touches only packedSum[r].
+  thread_loop( (int r=0;r<rd;r++),{
+
+    const int base=r*grid->_ostride[orthogdim]; // first site of plane r
+
+    for(int n=0;n<nblock;n++){
+      const int row=base+n*sstride;
+      for(int b=0;b<block;b++){
+	vector_type contrib = TensorRemove(innerProduct(left_v[row+b],right_v[row+b]));
+	packedSum[r]=packedSum[r]+contrib;
+      }
+    }
+  });
+
+  // Split every SIMD accumulator into its lanes; lane `lane` of index rt is
+  // added to the local slice rt + icoor[orthogdim]*rd.
+  Coordinate icoor(Nd);
+  for(int rt=0;rt<rd;rt++){
+
+    iScalar<vector_type> packed; 
+    packed._internal = packedSum[rt];
+    extract(packed,lanes);
+
+    for(int lane=0;lane<Nsimd;lane++){
+      grid->iCoorFromIindex(icoor,lane);
+      const int lt = rt+icoor[orthogdim]*rd;
+      sliceSum[lt]=sliceSum[lt]+lanes[lane]._internal;
+    }
+  }
+  
+  // Sum over nodes, one slice at a time: only the node whose processor
+  // coordinate along orthogdim equals t/ld contributes a non-zero value.
+  for(int t=0;t<fd;t++){
+    scalar_type gsum;
+    if ( t/ld == grid->_processor_coor[orthogdim] ) {
+      gsum=sliceSum[t%ld];
+    } else {
+      gsum=scalar_type(0.0);
+    }
+    grid->GlobalSum(gsum);
+    result[t]=gsum;
+  }
+}
+template<class vobj>
+static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) 
+{
+  // Norm squared of rhs on every slice orthogonal to dimension `Orthog`:
+  // sn is resized to the global extent of that dimension and sn[t] is set to
+  // the real part of the slice inner product of rhs with itself.
+  int Nblock = rhs.Grid()->GlobalDimensions()[Orthog];
+  // Must be exactly std::vector<ComplexD>, the type sliceInnerProductVector
+  // takes by non-const reference.
+  std::vector<ComplexD> ip(Nblock);
+  sn.resize(Nblock);
+  sliceInnerProductVector(ip,rhs,rhs,Orthog);
+  for(int ss=0;ss<Nblock;ss++){
+    sn[ss] = real(ip[ss]);
+  }
+}
+
+
+template<class vobj>
+static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
+			    int orthogdim,RealD scale=1.0) 
+{    
+  // Slice-wise multiply-add: every site of R is set to c*X + Y, where the
+  // coefficient c = scale*a[t] depends only on the slice index t of the site
+  // along `orthogdim`. The grid of X supplies the site layout for all fields.
+  // NOTE(review): a is indexed with r + icoor[orthogdim]*rd, which looks like
+  // a node-local slice index -- confirm callers size and fill a accordingly.
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::tensor_reduced tensor_reduced;
+
+  scalar_type zscale(scale);
+  GridBase *grid  = X.Grid();
+
+  int Nsimd  =grid->Nsimd();
+  int rd     =grid->_rdimensions[orthogdim];
+
+  int e1     =grid->_slice_nblock[orthogdim];
+  int e2     =grid->_slice_block [orthogdim];
+  int stride =grid->_slice_stride[orthogdim];
+
+  // The views do not depend on the slice, so they are taken once up front.
+  auto Rv=R.View();
+  auto Xv=X.View();
+  auto Yv=Y.View();
+
+  Coordinate icoor;
+  for(int r=0;r<rd;r++){
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    // Fill the SIMD coefficient one lane at a time through a scalar pointer.
+    vector_type    av;
+    scalar_type *as =(scalar_type *)&av;
+    for(int l=0;l<Nsimd;l++){
+      grid->iCoorFromIindex(icoor,l);
+      int ldx =r+icoor[orthogdim]*rd;
+      as[l] = scalar_type(a[ldx])*zscale;
+    }
+
+    tensor_reduced at; at=av;
+
+    thread_loop_collapse2( (int n=0;n<e1;n++) , {
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b;
+	Rv[ss] = at*Xv[ss]+Yv[ss];
+      }
+    });
+  }
+}
+
+/*
+inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
+{
+  int NN    = BlockSolverGrid->_ndimension;
+  int nsimd = BlockSolverGrid->Nsimd();
+  
+  std::vector<int> latt_phys(0);
+  std::vector<int> simd_phys(0);
+  std::vector<int>  mpi_phys(0);
+  
+  for(int d=0;d<NN;d++){
+    if( d!=Orthog ) { 
+      latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
+      simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
+      mpi_phys.push_back(BlockSolverGrid->_processors[d]);
+    }
+  }
+  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
+}
+*/
+
+template<class vobj>
+static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
+{    
+  // Slice-mixing multiply-add. With i,j running over the Nblock slices along
+  // `Orthog`, each site o of the orthogonal plane is updated as
+  //     R[o,i] = Y[o,i] + sum_j X[o,j] * (scale*aa(j,i))
+  //   aa    : coefficient matrix, read at (j,i) for i,j in [0,Nblock)
+  //   scale : real factor applied to every coefficient
+  //   R     : output, written at every site visited
+  // The SIMD layout along `Orthog` must be 1 (asserted below).
+  GridBase *FullGrid = X.Grid();
+  const int Nblock   = FullGrid->GlobalDimensions()[Orthog];
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  // Site (n,b) of slice i is addressed as n*stride + b + i*ostride.
+  const int stride = FullGrid->_slice_stride[Orthog];
+  const int block  = FullGrid->_slice_block [Orthog];
+  const int nblock = FullGrid->_slice_nblock[Orthog];
+  const int ostride= FullGrid->_ostride[Orthog];
+
+  thread_region
+  {
+    // Scratch declared inside the thread_region block: the Nblock values of X
+    // along `Orthog` at the plane site currently being processed.
+    Vector<vobj> x_line(Nblock);
+
+    thread_loop_collapse_in_region(2 ,(int n=0;n<nblock;n++), {
+     for(int b=0;b<block;b++){
+      const int base = n*stride + b;
+
+      // Gather X along `Orthog` before anything is written to R at this site.
+      for(int j=0;j<Nblock;j++){
+	x_line[j] = X[base+j*ostride];
+      }
+
+      vobj acc;
+      for(int i=0;i<Nblock;i++){
+	const int site = base+i*ostride;
+	acc = Y[site];
+	for(int j=0;j<Nblock;j++){
+	  acc = acc + x_line[j]*(scale*aa(j,i));
+	}
+	R[site]=acc;
+      }
+    }});
+  }
+};
+
+template<class vobj>
+static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
+{    
+  // Slice-mixing product without an additive term. With i,j running over the
+  // Nblock slices along `Orthog`, each site o of the orthogonal plane gets
+  //     R[o,i] = sum_j X[o,j] * (scale*aa(j,i))
+  //   aa    : coefficient matrix, read at (j,i) for i,j in [0,Nblock)
+  //   scale : real factor applied to every coefficient
+  //   R     : output, written at every site visited
+  // The SIMD layout along `Orthog` must be 1 (asserted below).
+  GridBase *FullGrid = X.Grid();
+  const int Nblock   = FullGrid->GlobalDimensions()[Orthog];
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  // Site (n,b) of slice i is addressed as n*stride + b + i*ostride.
+  const int stride = FullGrid->_slice_stride[Orthog];
+  const int block  = FullGrid->_slice_block [Orthog];
+  const int nblock = FullGrid->_slice_nblock[Orthog];
+  const int ostride= FullGrid->_ostride[Orthog];
+
+  thread_region
+  {
+    // Scratch declared inside the thread_region block: values of X along `Orthog`.
+    // NOTE(review): sliceMaddMatrix keeps the same scratch in a Grid Vector<vobj>; confirm std::vector is intended here.
+    std::vector<vobj> x_line(Nblock);
+
+    thread_loop_collapse_in_region( 2 , (int n=0;n<nblock;n++),{
+    for(int b=0;b<block;b++){
+      const int base = n*stride + b;
+
+      // Gather X along `Orthog` before anything is written to R at this site.
+      for(int j=0;j<Nblock;j++){
+	x_line[j] = X[base+j*ostride];
+      }
+
+      vobj acc;
+      for(int i=0;i<Nblock;i++){
+	// start from the j=0 term rather than from a zeroed accumulator
+	acc = x_line[0]*(scale*aa(0,i));
+	for(int j=1;j<Nblock;j++){
+	  acc = acc + x_line[j]*(scale*aa(j,i));
+	}
+	R[base+i*ostride]=acc;
+      }
+    }});
+  }
+};
+
+
+template<class vobj>
+static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
+{
+  // Matrix of slice-to-slice inner products along `Orthog`. mat is reset to an
+  // Nblock x Nblock zero matrix and then filled with
+  //     mat(i,j) = sum over plane sites o of Reduce(innerProduct(lhs[o,i], rhs[o,j]))
+  // accumulated first per thread_region block, then across nodes via GlobalSum.
+  //   mat      : output, overwritten
+  //   lhs, rhs : fields; the grid of lhs is used for all indexing
+  // The SIMD layout along `Orthog` must be 1 (asserted below).
+
+  GridBase *FullGrid = lhs.Grid();
+  const int Nblock   = FullGrid->GlobalDimensions()[Orthog];
+
+  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  // Site (n,b) of slice i is addressed as n*stride + b + i*ostride.
+  const int stride = FullGrid->_slice_stride[Orthog];
+  const int block  = FullGrid->_slice_block [Orthog];
+  const int nblock = FullGrid->_slice_nblock[Orthog];
+  const int ostride= FullGrid->_ostride[Orthog];
+
+  thread_region
+  {
+    // Scratch rows and a partial matrix declared inside the thread_region
+    // block, so the site loop below never accumulates into mat directly.
+    std::vector<vobj> lrow(Nblock);
+    std::vector<vobj> rrow(Nblock);
+    Eigen::MatrixXcd  partial = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+    thread_loop_collapse_in_region( 2, (int n=0;n<nblock;n++),{
+    for(int b=0;b<block;b++){
+
+      const int base = n*stride + b;
+
+      // Gather both fields along `Orthog` at this plane site.
+      for(int k=0;k<Nblock;k++){
+	lrow[k] = lhs[base+k*ostride];
+	rrow[k] = rhs[base+k*ostride];
+      }
+
+      // All Nblock x Nblock pairings of the gathered values.
+      for(int i=0;i<Nblock;i++){
+      for(int j=0;j<Nblock;j++){
+	auto site_ip = TensorRemove(innerProduct(lrow[i],rrow[j]));
+	partial(i,j) += Reduce(site_ip);
+      }}
+    }});
+
+    // Fold the partial matrix into the shared result inside thread_critical.
+    thread_critical
+    {
+      mat += partial;
+    }  
+  }
+
+  // Sum over nodes, one matrix element per GlobalSum call.
+  for(int i=0;i<Nblock;i++){
+  for(int j=0;j<Nblock;j++){
+    ComplexD entry = mat(i,j);
+    FullGrid->GlobalSum(entry);
+    mat(i,j)=entry;
+  }}
+
+  return;
+}
+
+NAMESPACE_END(Grid);
+
+
+
+
diff --git a/lib/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h
similarity index 94%
rename from lib/lattice/Lattice_rng.h
rename to Grid/lattice/Lattice_rng.h
index 78e59332..a9f8d6ee 100644
--- a/lib/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -158,10 +158,19 @@ public:
     // tens of seconds per trajectory so this is clean in all reasonable cases,
     // and margin of safety is orders of magnitude.
     // We could hack Sitmo to skip in the higher order words of state if necessary
+      //
+      // Replace with 2^30 ; avoid problem on large volumes
+      //
     /////////////////////////////////////////////////////////////////////////////////////
     //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init
+      const int shift = 30;
+
     uint64_t skip = site;
-    skip = skip<<40;
+
+      skip = skip<<shift;
+
+      assert((skip >> shift)==site); // check for overflow
+
     eng.discard(skip);
     //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
   } 
@@ -308,6 +317,19 @@ public:
     std::seed_seq src(seeds.begin(),seeds.end());
     Seed(src,0);
   }
+
+    // Seed reproducibly from an arbitrary string: s is turned into integer seeds
+    // by GridChecksum::sha256_seeds, which are logged and passed to SeedFixedIntegers.
+    void SeedUniqueString(const std::string &s){
+      std::vector<int> seeds;
+      seeds = GridChecksum::sha256_seeds(s);
+      std::cout << GridLogMessage << "Intialising serial RNG with unique string '" 
+                << s << "'" << std::endl;
+      // Format the digest with the same helper the parallel RNG uses instead of
+      // an unpadded per-integer hex dump, so both generators log it the same way.
+      std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
+      SeedFixedIntegers(seeds);
+    }
 };
 
 class GridParallelRNG : public GridRNGbase {
@@ -370,6 +392,14 @@ public:
     _time_counter += usecond()- inner_time_counter;
   }
 
+    // Seeds the parallel RNG from the integer seeds GridChecksum::sha256_seeds derives from s.
+    void SeedUniqueString(const std::string &s){
+      std::vector<int> seeds = GridChecksum::sha256_seeds(s);
+      std::cout << GridLogMessage << "Intialising parallel RNG with unique string '" << s << "'" << std::endl;
+      std::cout << GridLogMessage << "Seed SHA256: "
+                << GridChecksum::sha256_string(seeds) << std::endl;
+      SeedFixedIntegers(seeds);
+    }
   void SeedFixedIntegers(const std::vector<int> &seeds){
 
     // Everyone generates the same seed_seq based on input seeds
diff --git a/lib/lattice/Lattice_trace.h b/Grid/lattice/Lattice_trace.h
similarity index 100%
rename from lib/lattice/Lattice_trace.h
rename to Grid/lattice/Lattice_trace.h
diff --git a/lib/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
similarity index 90%
rename from lib/lattice/Lattice_transfer.h
rename to Grid/lattice/Lattice_transfer.h
index 821890a5..4f46fb60 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -25,8 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_LATTICE_TRANSFER_H
-#define GRID_LATTICE_TRANSFER_H
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -476,9 +475,11 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
   assert(orthog>=0);
 
   for(int d=0;d<nh;d++){
+    if ( d!=orthog ) {
     assert(lg->_processors[d]  == hg->_processors[d]);
     assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
   }
+  }
 
   // the above should guarantee that the operations are local
   thread_loop( (int idx=0;idx<lg->lSites();idx++),{
@@ -497,7 +498,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
 
 
 template<class vobj>
-void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
   typedef typename vobj::scalar_object sobj;
 
@@ -511,9 +512,11 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
   assert(orthog>=0);
 
   for(int d=0;d<nh;d++){
+    if ( d!=orthog ) {
     assert(lg->_processors[d]  == hg->_processors[d]);
     assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
   }
+  }
 
   // the above should guarantee that the operations are local
   thread_loop( (int idx=0;idx<lg->lSites();idx++),{
@@ -616,6 +619,51 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
     extract(in_vobj, out_ptrs, 0);
   });
 }
+
+template<typename vobj, typename sobj>
+typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type 
+unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
+{
+  // Copies a SIMD-vectorized lattice into `out`, one scalar object per local
+  // site, at the index computed by Lexicographic::IndexFromCoorReversed.
+  // `out` is resized to lSites().
+  // NOTE(review): uses in._grid, in._odata, parallel_for and extract1 while other code in this file goes through Grid() and thread_loop -- confirm.
+  typedef typename vobj::vector_type vtype;
+  GridBase* grid = in._grid;
+  const int nd    = grid->Nd();
+  const int nlane = vtype::Nsimd();
+  out.resize(grid->lSites());
+
+  // Coordinate returned by iCoorFromIindex for every lane, computed once.
+  std::vector<Coordinate > lane_coor(nlane);
+  for(int lane=0; lane < nlane; lane++){
+    lane_coor[lane].resize(nd);
+    grid->iCoorFromIindex(lane_coor[lane], lane);
+  }
+
+  parallel_for(int oidx = 0; oidx < grid->oSites(); oidx++){
+    // Outer coordinate of this vectorized site
+    Coordinate ocoor(nd);
+    grid->oCoorFromOindex(ocoor, oidx);
+
+    // For each lane, find where its scalar site lives in `out`
+    std::vector<sobj*> dest(nlane);
+    Coordinate lcoor(grid->Nd());
+    for(int lane=0; lane < nlane; lane++){
+      for(int mu=0;mu<nd;mu++){
+	lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*lane_coor[lane][mu];
+      }
+      int lex;
+      Lexicographic::IndexFromCoorReversed(lcoor, lex, grid->_ldimensions);
+      dest[lane] = &out[lex];
+    }
+
+    // Scatter the lanes of the vector object through those pointers
+    const vobj & packed = in._odata[oidx];
+    extract1(packed, dest, 0);
+  }
+}
+
 //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
 template<typename vobj, typename sobj>
 typename std::enable_if<isSIMDvectorized<vobj>::value 
@@ -664,11 +712,62 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
   });
 }
 
+template<typename vobj, typename sobj>
+typename std::enable_if<isSIMDvectorized<vobj>::value 
+                    && !isSIMDvectorized<sobj>::value, void>::type 
+vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
+{
+  // Packs an array of scalar objects, one per local site and indexed by
+  // Lexicographic::IndexFromCoorReversed, into the SIMD-vectorized lattice
+  // `out`. `in` must hold exactly lSites() entries (asserted).
+  // NOTE(review): uses out._grid, out._odata, parallel_for and merge1 while
+  // other code in this file goes through Grid() and thread_loop -- confirm.
+  typedef typename vobj::vector_type vtype;
+  GridBase* grid = out._grid;
+  assert(in.size()==grid->lSites());
+
+  const int nd    = grid->Nd();
+  const int nlane = vtype::Nsimd();
+
+  // Coordinate returned by iCoorFromIindex for every lane, computed once.
+  std::vector<Coordinate > lane_coor(nlane);
+  for(int lane=0; lane < nlane; lane++){
+    lane_coor[lane].resize(nd);
+    grid->iCoorFromIindex(lane_coor[lane],lane);
+  }
+
+  parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){
+    // Outer coordinate of this vectorized site
+    Coordinate ocoor(nd);
+    grid->oCoorFromOindex(ocoor, oidx);
+
+    // For each lane, locate its scalar site inside `in`
+    std::vector<sobj*> src(nlane);
+    Coordinate lcoor(grid->Nd());
+    for(int lane=0; lane < nlane; lane++){
+      for(int mu=0;mu<nd;mu++){
+	lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*lane_coor[lane][mu];
+      }
+      int lex;
+      Lexicographic::IndexFromCoorReversed(lcoor, lex, grid->_ldimensions);
+      src[lane] = &in[lex];
+    }
+
+    // Gather the lanes into one vector object and store it
+    vobj packed;
+    merge1(packed, src, 0);
+    out._odata[oidx] = packed; 
+  }
+}
+
 //Convert a Lattice from one precision to another
 template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
-
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
+{
   assert(out.Grid()->Nd() == in.Grid()->Nd());
+  for(int d=0;d<out.Grid()->Nd();d++){
+    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+  }
   out.Checkerboard() = in.Checkerboard();
   GridBase *in_grid=in.Grid();
   GridBase *out_grid = out.Grid();
@@ -1006,4 +1105,4 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
 }
 
 NAMESPACE_END(Grid);
-#endif
+
diff --git a/lib/lattice/Lattice_transpose.h b/Grid/lattice/Lattice_transpose.h
similarity index 100%
rename from lib/lattice/Lattice_transpose.h
rename to Grid/lattice/Lattice_transpose.h
diff --git a/lib/lattice/Lattice_unary.h b/Grid/lattice/Lattice_unary.h
similarity index 100%
rename from lib/lattice/Lattice_unary.h
rename to Grid/lattice/Lattice_unary.h
diff --git a/Grid/lattice/Lattice_where.h b/Grid/lattice/Lattice_where.h
new file mode 100644
index 00000000..6686d1b3
--- /dev/null
+++ b/Grid/lattice/Lattice_where.h
@@ -0,0 +1,86 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_where.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_WHERE_H
+#define GRID_LATTICE_WHERE_H
+namespace Grid {
+// Must implement the predicate gating the 
+// Must be able to reduce the predicate down to a single vInteger per site.
+// Must be able to require the type be iScalar x iScalar x ....
+//                              give a GetVtype method in iScalar
+//                              and blow away the tensor structures.
+//
+template<class vobj,class iobj>
+inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+{
+  // Lane-by-lane select: ret takes the value of iftrue wherever predicate is
+  // non-zero and the value of iffalse elsewhere. All four lattices must be
+  // conformable.
+  conformable(iftrue,iffalse);
+  conformable(iftrue,predicate);
+  conformable(iftrue,ret);
+
+  GridBase *grid=iftrue._grid;
+  typedef typename vobj::scalar_object scalar_object;
+
+  const int Nsimd = grid->Nsimd();
+
+  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
+    // Scratch buffers are created per iteration: parallel_for iterations may
+    // run concurrently (presumably on different threads) and must not share them.
+    std::vector<Integer> mask(Nsimd);
+    std::vector<scalar_object> truevals (Nsimd);
+    std::vector<scalar_object> falsevals(Nsimd);
+
+    extract(iftrue._odata[ss]   ,truevals);
+    extract(iffalse._odata[ss]  ,falsevals);
+    extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
+
+    // Overwrite the "false" lanes whose mask is set, then repack the result.
+    for(int s=0;s<Nsimd;s++){
+      if (mask[s]) falsevals[s]=truevals[s];
+    }
+    merge(ret._odata[ss],falsevals);
+  }
+}
+
+template<class vobj,class iobj>
+inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+{
+  // Value-returning form: builds a lattice on the grid of iftrue and fills it
+  // with the four-argument whereWolf defined in this file.
+  conformable(iftrue,iffalse);
+  conformable(iftrue,predicate);
+
+  Lattice<vobj> ret(iftrue._grid);
+  whereWolf(ret,predicate,iftrue,iffalse); // was `where`, a name this header does not define
+  return ret;
+}
+}
+#endif
diff --git a/lib/log/Log.cc b/Grid/log/Log.cc
similarity index 100%
rename from lib/log/Log.cc
rename to Grid/log/Log.cc
diff --git a/lib/log/Log.h b/Grid/log/Log.h
similarity index 90%
rename from lib/log/Log.h
rename to Grid/log/Log.h
index 6fd09124..81ecf464 100644
--- a/lib/log/Log.h
+++ b/Grid/log/Log.h
@@ -85,6 +85,7 @@ protected:
   Colours &Painter;
   int active;
   int timing_mode;
+  int topWidth{-1}, chanWidth{-1};
   static int timestamp;
   std::string name, topName;
   std::string COLOUR;
@@ -123,18 +124,32 @@ public:
       Reset(); 
     }
   }
+  void setTopWidth(const int w) {topWidth = w;}
+  void setChanWidth(const int w) {chanWidth = w;}
 
   friend std::ostream& operator<< (std::ostream& stream, Logger& log){
 
     if ( log.active ) {
-      stream << log.background()<<  std::left << log.topName << log.background()<< " : ";
-      stream << log.colour() <<  std::left << log.name << log.background() << " : ";
+      stream << log.background()<<  std::left;
+      if (log.topWidth > 0)
+      {
+        stream << std::setw(log.topWidth);
+      }
+      stream << log.topName << log.background()<< " : ";
+      stream << log.colour() <<  std::left;
+      if (log.chanWidth > 0)
+      {
+        stream << std::setw(log.chanWidth);
+      }
+      stream << log.name << log.background() << " : ";
       if ( log.timestamp ) {
 	log.StopWatch->Stop();
 	GridTime now = log.StopWatch->Elapsed();
+	
 	if ( log.timing_mode==1 ) log.StopWatch->Reset();
 	log.StopWatch->Start();
-	stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
+	stream << log.evidence()
+	       << now	       << log.background() << " : " ;
       }
       stream << log.colour();
       return stream;
diff --git a/Grid/parallelIO/BinaryIO.cc b/Grid/parallelIO/BinaryIO.cc
new file mode 100644
index 00000000..221a7fe8
--- /dev/null
+++ b/Grid/parallelIO/BinaryIO.cc
@@ -0,0 +1,3 @@
+#include <Grid/GridCore.h>
+
+int Grid::BinaryIO::latticeWriteMaxRetry = -1; // out-of-class definition of the static member declared in class BinaryIO
diff --git a/lib/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h
similarity index 81%
rename from lib/parallelIO/BinaryIO.h
rename to Grid/parallelIO/BinaryIO.h
index 6e3c95eb..a2de50c5 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -1,4 +1,4 @@
-/*************************************************************************************
+    /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
 
@@ -24,10 +24,9 @@
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
     See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_BINARY_IO_H
-#define GRID_BINARY_IO_H
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
 
 #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
 #define USE_MPI_IO
@@ -42,14 +41,14 @@
 #include <arpa/inet.h>
 #include <algorithm>
 
-NAMESPACE_BEGIN(Grid); 
+NAMESPACE_BEGIN(Grid);
 
 /////////////////////////////////////////////////////////////////////////////////
 // Byte reversal garbage
 /////////////////////////////////////////////////////////////////////////////////
 inline uint32_t byte_reverse32(uint32_t f) { 
-  f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
-  return f;
+      f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
+      return f;
 }
 inline uint64_t byte_reverse64(uint64_t f) { 
   uint64_t g;
@@ -79,7 +78,8 @@ inline void removeWhitespace(std::string &key)
 // Could just use a namespace
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
-public:
+ public:
+  static int latticeWriteMaxRetry;
 
   /////////////////////////////////////////////////////////////////////////////
   // more byte manipulation helpers
@@ -90,7 +90,7 @@ public:
     typedef typename vobj::scalar_object sobj;
 
     GridBase *grid = lat.Grid();
-    int lsites = grid->lSites();
+    uint64_t lsites = grid->lSites();
 
     std::vector<sobj> scalardata(lsites); 
     unvectorizeToLexOrdArray(scalardata,lat);    
@@ -104,22 +104,26 @@ public:
     const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
 
     uint64_t lsites = grid->lSites();
-    if (fbuf.size() == 1) {
+    if (fbuf.size() == 1)
+    {
       lsites = 1;
     }
 
-    thread_region {
-
+    thread_region
+    {
       uint32_t nersc_csum_thr = 0;
 
-      thread_loop( (uint64_t local_site = 0; local_site < lsites; local_site++), {
-	  uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
-	  for (uint64_t j = 0; j < size32; j++) {
-	    nersc_csum_thr = nersc_csum_thr + site_buf[j];
-	  }
+      thread_loop_in_region( (uint64_t local_site = 0; local_site < lsites; local_site++),
+      {
+        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
+        for (uint64_t j = 0; j < size32; j++)
+        {
+          nersc_csum_thr = nersc_csum_thr + site_buf[j];
+        }
       });
 
-      thread_critical {
+      thread_critical
+      {
         nersc_csum += nersc_csum_thr;
       }
     }
@@ -137,20 +141,22 @@ public:
     Coordinate local_start =grid->LocalStarts();
     Coordinate global_vol  =grid->FullDimensions();
 
-    thread_region { 
-
+    thread_region
+    { 
       Coordinate coor(nd);
       uint32_t scidac_csuma_thr=0;
       uint32_t scidac_csumb_thr=0;
       uint32_t site_crc=0;
 
-      thread_loop( (uint64_t local_site=0;local_site<lsites;local_site++),{
+      thread_loop_in_region( (uint64_t local_site=0;local_site<lsites;local_site++),{
 
 	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
 
 	/* 
 	 * Scidac csum  is rather more heavyweight
+	 * FIXME -- 128^3 x 256 x 16 will overflow.
 	 */
+	
 	int global_site;
 
 	Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@@ -168,11 +174,11 @@ public:
 	//	std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
 	//	std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
 	scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29);
-        scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
-
+	scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
       });
 
-      thread_critical {
+      thread_critical
+      {
 	scidac_csuma^= scidac_csuma_thr;
 	scidac_csumb^= scidac_csumb_thr;
       }
@@ -189,7 +195,7 @@ public:
   {
     uint32_t * f = (uint32_t *)file_object;
     uint64_t count = bytes/sizeof(uint32_t);
-    thread_loop( (uint64_t i=0;i<count;i++), {  
+    thread_loop( (uint64_t i=0;i<count;i++),{  
       f[i] = ntohl(f[i]);
     });
   }
@@ -200,7 +206,7 @@ public:
     uint32_t f;
 
     uint64_t count = bytes/sizeof(uint32_t);
-    thread_loop( (uint64_t i=0;i<count;i++), {  
+    thread_loop( (uint64_t i=0;i<count;i++),{  
       f = fp[i];
       // got network order and the network to host
       f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
@@ -225,7 +231,7 @@ public:
     uint64_t f,g;
     
     uint64_t count = bytes/sizeof(uint64_t);
-    thread_loop( (uint64_t i=0;i<count;i++), {  
+    thread_loop( (uint64_t i=0;i<count;i++),{  
       f = fp[i];
       // got network order and the network to host
       g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; 
@@ -251,7 +257,7 @@ public:
 			      GridBase *grid,
 			      std::vector<fobj> &iodata,
 			      std::string file,
-			      Integer offset,
+			      uint64_t& offset,
 			      const std::string &format, int control,
 			      uint32_t &nersc_csum,
 			      uint32_t &scidac_csuma,
@@ -358,17 +364,17 @@ public:
 #endif
       } else {
 	std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
-                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
         std::ifstream fin;
-        fin.open(file, std::ios::binary | std::ios::in);
+	fin.open(file, std::ios::binary | std::ios::in);
         if (control & BINARYIO_MASTER_APPEND)
-	  {
-	    fin.seekg(-sizeof(fobj), fin.end);
-	  }
+        {
+          fin.seekg(-sizeof(fobj), fin.end);
+        }
         else
-	  {
-	    fin.seekg(offset + myrank * lsites * sizeof(fobj));
-	  }
+        {
+          fin.seekg(offset + myrank * lsites * sizeof(fobj));
+        }
         fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
         assert(fin.fail() == 0);
         fin.close();
@@ -407,26 +413,32 @@ public:
         ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
 	//        std::cout << GridLogMessage << "Checking for errors" << std::endl;
         if (ierr != MPI_SUCCESS)
-	  {
-	    char error_string[BUFSIZ];
-	    int length_of_error_string, error_class;
+        {
+          char error_string[BUFSIZ];
+          int length_of_error_string, error_class;
 
-	    MPI_Error_class(ierr, &error_class);
-	    MPI_Error_string(error_class, error_string, &length_of_error_string);
-	    fprintf(stderr, "%3d: %s\n", myrank, error_string);
-	    MPI_Error_string(ierr, error_string, &length_of_error_string);
-	    fprintf(stderr, "%3d: %s\n", myrank, error_string);
-	    MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
-	  }
+          MPI_Error_class(ierr, &error_class);
+          MPI_Error_string(error_class, error_string, &length_of_error_string);
+          fprintf(stderr, "%3d: %s\n", myrank, error_string);
+          MPI_Error_string(ierr, error_string, &length_of_error_string);
+          fprintf(stderr, "%3d: %s\n", myrank, error_string);
+          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
+        }
 
-        std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
+        std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
         ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
         assert(ierr == 0);
 
-        std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
+        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
         ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
         assert(ierr == 0);
 
+        MPI_Offset os;
+        MPI_File_get_position(fh, &os);
+        MPI_File_get_byte_offset(fh, os, &disp);
+        offset = disp;
+
+
         MPI_File_close(&fh);
         MPI_Type_free(&fileArray);
         MPI_Type_free(&localArray);
@@ -436,16 +448,20 @@ public:
       } else { 
 
         std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
-                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+                  << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
         
 	std::ofstream fout; 
 	fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
 	try {
-	  fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+	  if (offset) { // Must already exist and contain data
+	    fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+	  } else {     // Allow create
+	    fout.open(file,std::ios::binary|std::ios::out);
+	  }
 	} catch (const std::fstream::failure& exc) {
 	  std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
 	  std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
-	  std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
+	  //	  std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
 #ifdef USE_MPI_IO
 	  MPI_Abort(MPI_COMM_WORLD,1);
 #else
@@ -479,6 +495,7 @@ public:
 	  exit(1);
 #endif
 	}
+  offset  = fout.tellp();
 	fout.close();
       }
       timer.Stop();
@@ -513,7 +530,7 @@ public:
   static inline void readLatticeObject(Lattice<vobj> &Umu,
 				       std::string file,
 				       munger munge,
-				       Integer offset,
+				       uint64_t offset,
 				       const std::string &format,
 				       uint32_t &nersc_csum,
 				       uint32_t &scidac_csuma,
@@ -523,7 +540,7 @@ public:
     typedef typename vobj::Realified::scalar_type word;    word w=0;
 
     GridBase *grid = Umu.Grid();
-    int lsites = grid->lSites();
+    uint64_t lsites = grid->lSites();
 
     std::vector<sobj> scalardata(lsites); 
     std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
@@ -534,7 +551,7 @@ public:
     GridStopWatch timer; 
     timer.Start();
 
-    thread_loop( (int x=0;x<lsites;x++), { munge(iodata[x], scalardata[x]); });
+    thread_loop( (uint64_t x=0;x<lsites;x++), { munge(iodata[x], scalardata[x]); });
 
     vectorizeFromLexOrdArray(scalardata,Umu);    
     grid->Barrier();
@@ -547,19 +564,21 @@ public:
   // Write a Lattice of object
   //////////////////////////////////////////////////////////////////////////////////////
   template<class vobj,class fobj,class munger>
-  static inline void writeLatticeObject(Lattice<vobj> &Umu,
-					std::string file,
-					munger munge,
-					Integer offset,
-					const std::string &format,
-					uint32_t &nersc_csum,
-					uint32_t &scidac_csuma,
-					uint32_t &scidac_csumb)
+    static inline void writeLatticeObject(Lattice<vobj> &Umu,
+					  std::string file,
+					  munger munge,
+					  uint64_t offset,
+					  const std::string &format,
+					  uint32_t &nersc_csum,
+					  uint32_t &scidac_csuma,
+					  uint32_t &scidac_csumb)
   {
     typedef typename vobj::scalar_object sobj;
     typedef typename vobj::Realified::scalar_type word;    word w=0;
     GridBase *grid = Umu.Grid();
-    int lsites = grid->lSites();
+    uint64_t lsites = grid->lSites(), offsetCopy = offset;
+    int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
+    bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
 
     std::vector<sobj> scalardata(lsites); 
     std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
@@ -570,13 +589,39 @@ public:
     GridStopWatch timer; timer.Start();
     unvectorizeToLexOrdArray(scalardata,Umu);    
 
-    thread_loop( (int x=0;x<lsites;x++), { munge(scalardata[x],iodata[x]);});
+    thread_loop( (uint64_t x=0;x<lsites;x++), { munge(scalardata[x],iodata[x]); });
 
     grid->Barrier();
     timer.Stop();
+    while (attemptsLeft >= 0)
+    {
+      grid->Barrier();
+      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
+	             nersc_csum,scidac_csuma,scidac_csumb);
+      if (checkWrite)
+      {
+        std::vector<fobj> ckiodata(lsites);
+        uint32_t          cknersc_csum, ckscidac_csuma, ckscidac_csumb;
+        uint64_t          ckoffset = offsetCopy;
 
-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
-	     nersc_csum,scidac_csuma,scidac_csumb);
+        std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
+        grid->Barrier();
+        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+	               cknersc_csum,ckscidac_csuma,ckscidac_csumb);
+        if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
+        {
+          std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
+          offset = offsetCopy;
+        }
+        else
+        {
+          std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
+          break;
+        }
+      }
+      attemptsLeft--;
+    }
+    if (checkWrite && (attemptsLeft < 0)) { std::cout << GridLogError << "writeLatticeObject: read test checksum failure, no attempt remaining" << std::endl; } // every attempt failed verification: report it instead of returning silently
 
     std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed()  <<std::endl;
   }
@@ -587,7 +632,7 @@ public:
   static inline void readRNG(GridSerialRNG &serial,
 			     GridParallelRNG &parallel,
 			     std::string file,
-			     Integer offset,
+			     uint64_t offset,
 			     uint32_t &nersc_csum,
 			     uint32_t &scidac_csuma,
 			     uint32_t &scidac_csumb)
@@ -600,8 +645,8 @@ public:
     std::string format = "IEEE32BIG";
 
     GridBase *grid = parallel.Grid();
-    int gsites = grid->gSites();
-    int lsites = grid->lSites();
+    uint64_t gsites = grid->gSites();
+    uint64_t lsites = grid->lSites();
 
     uint32_t nersc_csum_tmp   = 0;
     uint32_t scidac_csuma_tmp = 0;
@@ -616,7 +661,7 @@ public:
 	     nersc_csum,scidac_csuma,scidac_csumb);
 
     timer.Start();
-    thread_loop( (int lidx=0;lidx<lsites;lidx++),{
+    thread_loop( (uint64_t lidx=0;lidx<lsites;lidx++),{
       std::vector<RngStateType> tmp(RngStateCount);
       std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
       parallel.SetState(tmp,lidx);
@@ -649,7 +694,7 @@ public:
   static inline void writeRNG(GridSerialRNG &serial,
 			      GridParallelRNG &parallel,
 			      std::string file,
-			      Integer offset,
+			      uint64_t offset,
 			      uint32_t &nersc_csum,
 			      uint32_t &scidac_csuma,
 			      uint32_t &scidac_csumb)
@@ -660,8 +705,8 @@ public:
     typedef std::array<RngStateType,RngStateCount> RNGstate;
 
     GridBase *grid = parallel.Grid();
-    int gsites = grid->gSites();
-    int lsites = grid->lSites();
+    uint64_t gsites = grid->gSites();
+    uint64_t lsites = grid->lSites();
 
     uint32_t nersc_csum_tmp;
     uint32_t scidac_csuma_tmp;
@@ -674,7 +719,7 @@ public:
 
     timer.Start();
     std::vector<RNGstate> iodata(lsites);
-    thread_loop( (int lidx=0;lidx<lsites;lidx++),{
+    thread_loop( (uint64_t lidx=0;lidx<lsites;lidx++),{
       std::vector<RngStateType> tmp(RngStateCount);
       parallel.GetState(tmp,lidx);
       std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
@@ -683,7 +728,6 @@ public:
 
     IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
 	     nersc_csum,scidac_csuma,scidac_csumb);
-
     iodata.resize(1);
     {
       std::vector<RngStateType> tmp(RngStateCount);
@@ -703,5 +747,5 @@ public:
     std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
   }
 };
+
 NAMESPACE_END(Grid);
-#endif
diff --git a/lib/parallelIO/IldgIO.h b/Grid/parallelIO/IldgIO.h
similarity index 65%
rename from lib/parallelIO/IldgIO.h
rename to Grid/parallelIO/IldgIO.h
index 2e12b366..d7b188b9 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@@ -23,9 +23,8 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-			   /*  END LEGAL */
-#ifndef GRID_ILDG_IO_H
-#define GRID_ILDG_IO_H
+/*  END LEGAL */
+#pragma once
 
 #ifdef HAVE_LIME
 #include <algorithm>
@@ -38,158 +37,163 @@ directory
 #include <sys/utsname.h>
 #include <unistd.h>
 
-			   //C-Lime is a must have for this functionality
+//C-Lime is a must have for this functionality
 extern "C" {  
 #include "lime.h"
 }
 
 NAMESPACE_BEGIN(Grid);
 
-/////////////////////////////////
-// Encode word types as strings
-/////////////////////////////////
-template<class word> inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); }
-template<> inline std::string ScidacWordMnemonic<double>  (void){ return std::string("D"); }
-template<> inline std::string ScidacWordMnemonic<float>   (void){ return std::string("F"); }
-template<> inline std::string ScidacWordMnemonic< int32_t>(void){ return std::string("I32_t"); }
-template<> inline std::string ScidacWordMnemonic<uint32_t>(void){ return std::string("U32_t"); }
-template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); }
-template<> inline std::string ScidacWordMnemonic<uint64_t>(void){ return std::string("U64_t"); }
+  /////////////////////////////////
+  // Encode word types as strings
+  /////////////////////////////////
+ template<class word> inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); }
+ template<> inline std::string ScidacWordMnemonic<double>  (void){ return std::string("D"); }
+ template<> inline std::string ScidacWordMnemonic<float>   (void){ return std::string("F"); }
+ template<> inline std::string ScidacWordMnemonic< int32_t>(void){ return std::string("I32_t"); }
+ template<> inline std::string ScidacWordMnemonic<uint32_t>(void){ return std::string("U32_t"); }
+ template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); }
+ template<> inline std::string ScidacWordMnemonic<uint64_t>(void){ return std::string("U64_t"); }
 
-/////////////////////////////////////////
-// Encode a generic tensor as a string
-/////////////////////////////////////////
-template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { 
+  /////////////////////////////////////////
+  // Encode a generic tensor as a string
+  /////////////////////////////////////////
+ template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { 
 
-  typedef typename getPrecision<vobj>::real_scalar_type stype;
+   typedef typename getPrecision<vobj>::real_scalar_type stype;
 
-  int _ColourN       = indexRank<ColourIndex,vobj>();
-  int _ColourScalar  =  isScalar<ColourIndex,vobj>();
-  int _ColourVector  =  isVector<ColourIndex,vobj>();
-  int _ColourMatrix  =  isMatrix<ColourIndex,vobj>();
+   int _ColourN       = indexRank<ColourIndex,vobj>();
+   int _ColourScalar  =  isScalar<ColourIndex,vobj>();
+   int _ColourVector  =  isVector<ColourIndex,vobj>();
+   int _ColourMatrix  =  isMatrix<ColourIndex,vobj>();
 
-  int _SpinN       = indexRank<SpinIndex,vobj>();
-  int _SpinScalar  =  isScalar<SpinIndex,vobj>();
-  int _SpinVector  =  isVector<SpinIndex,vobj>();
-  int _SpinMatrix  =  isMatrix<SpinIndex,vobj>();
+   int _SpinN       = indexRank<SpinIndex,vobj>();
+   int _SpinScalar  =  isScalar<SpinIndex,vobj>();
+   int _SpinVector  =  isVector<SpinIndex,vobj>();
+   int _SpinMatrix  =  isMatrix<SpinIndex,vobj>();
 
-  int _LorentzN       = indexRank<LorentzIndex,vobj>();
-  int _LorentzScalar  =  isScalar<LorentzIndex,vobj>();
-  int _LorentzVector  =  isVector<LorentzIndex,vobj>();
-  int _LorentzMatrix  =  isMatrix<LorentzIndex,vobj>();
+   int _LorentzN       = indexRank<LorentzIndex,vobj>();
+   int _LorentzScalar  =  isScalar<LorentzIndex,vobj>();
+   int _LorentzVector  =  isVector<LorentzIndex,vobj>();
+   int _LorentzMatrix  =  isMatrix<LorentzIndex,vobj>();
 
-  std::stringstream stream;
+   std::stringstream stream;
 
-  stream << "GRID_";
-  stream << ScidacWordMnemonic<stype>();
+   stream << "GRID_";
+   stream << ScidacWordMnemonic<stype>();
 
-  if ( _LorentzVector )   stream << "_LorentzVector"<<_LorentzN;
-  if ( _LorentzMatrix )   stream << "_LorentzMatrix"<<_LorentzN;
+   if ( _LorentzVector )   stream << "_LorentzVector"<<_LorentzN;
+   if ( _LorentzMatrix )   stream << "_LorentzMatrix"<<_LorentzN;
 
-  if ( _SpinVector )   stream << "_SpinVector"<<_SpinN;
-  if ( _SpinMatrix )   stream << "_SpinMatrix"<<_SpinN;
+   if ( _SpinVector )   stream << "_SpinVector"<<_SpinN;
+   if ( _SpinMatrix )   stream << "_SpinMatrix"<<_SpinN;
 
-  if ( _ColourVector )   stream << "_ColourVector"<<_ColourN;
-  if ( _ColourMatrix )   stream << "_ColourMatrix"<<_ColourN;
+   if ( _ColourVector )   stream << "_ColourVector"<<_ColourN;
+   if ( _ColourMatrix )   stream << "_ColourMatrix"<<_ColourN;
 
-  if ( _ColourScalar && _LorentzScalar && _SpinScalar )   stream << "_Complex";
+   if ( _ColourScalar && _LorentzScalar && _SpinScalar )   stream << "_Complex";
 
 
-  typesize = sizeof(typename vobj::scalar_type);
+   typesize = sizeof(typename vobj::scalar_type);
 
-  if ( _ColourMatrix ) typesize*= _ColourN*_ColourN;
-  else                 typesize*= _ColourN;
+   if ( _ColourMatrix ) typesize*= _ColourN*_ColourN;
+   else                 typesize*= _ColourN;
 
-  if ( _SpinMatrix )   typesize*= _SpinN*_SpinN;
-  else                 typesize*= _SpinN;
+   if ( _SpinMatrix )   typesize*= _SpinN*_SpinN;
+   else                 typesize*= _SpinN;
 
-  colors    = _ColourN;
-  spins     = _SpinN;
-  datacount = _LorentzN;
+   colors    = _ColourN;
+   spins     = _SpinN;
+   datacount = _LorentzN;
 
-  return stream.str();
-}
+   return stream.str();
+ }
  
-template<class vobj> std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount) { 
-  return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
-};
+ template<class vobj> std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount) { 
+   return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
+ };
 
 
-////////////////////////////////////////////////////////////
-// Helper to fill out metadata
-////////////////////////////////////////////////////////////
-template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
-					 FieldMetaData &header,
-					 scidacRecord & _scidacRecord,
-					 scidacFile   & _scidacFile) 
-{
-  typedef typename getPrecision<vobj>::real_scalar_type stype;
+ ////////////////////////////////////////////////////////////
+ // Helper to fill out metadata
+ ////////////////////////////////////////////////////////////
+ template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
+					  FieldMetaData &header,
+					  scidacRecord & _scidacRecord,
+					  scidacFile   & _scidacFile) 
+ {
+   typedef typename getPrecision<vobj>::real_scalar_type stype;
 
-  /////////////////////////////////////
-  // Pull Grid's metadata
-  /////////////////////////////////////
-  PrepareMetaData(field,header);
+   /////////////////////////////////////
+   // Pull Grid's metadata
+   /////////////////////////////////////
+   PrepareMetaData(field,header);
 
-  /////////////////////////////////////
-  // Scidac Private File structure
-  /////////////////////////////////////
-  _scidacFile              = scidacFile(field.Grid());
+   /////////////////////////////////////
+   // Scidac Private File structure
+   /////////////////////////////////////
+   _scidacFile              = scidacFile(field.Grid());
 
-  /////////////////////////////////////
-  // Scidac Private Record structure
-  /////////////////////////////////////
-  scidacRecord sr;
-  sr.datatype   = ScidacRecordTypeString(field,sr.colors,sr.spins,sr.typesize,sr.datacount);
-  sr.date       = header.creation_date;
-  sr.precision  = ScidacWordMnemonic<stype>();
-  sr.recordtype = GRID_IO_FIELD;
+   /////////////////////////////////////
+   // Scidac Private Record structure
+   /////////////////////////////////////
+   scidacRecord sr;
+   sr.datatype   = ScidacRecordTypeString(field,sr.colors,sr.spins,sr.typesize,sr.datacount);
+   sr.date       = header.creation_date;
+   sr.precision  = ScidacWordMnemonic<stype>();
+   sr.recordtype = GRID_IO_FIELD;
 
-  _scidacRecord = sr;
+   _scidacRecord = sr;
 
-  //   std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
-}
+   //   std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
+ }
  
-///////////////////////////////////////////////////////
-// Scidac checksum
-///////////////////////////////////////////////////////
-static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
-{
-  uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
-  uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
-  if ( scidac_csuma !=scidac_checksuma) return 0;
-  if ( scidac_csumb !=scidac_checksumb) return 0;
-  return 1;
-}
+ ///////////////////////////////////////////////////////
+ // Scidac checksum
+ ///////////////////////////////////////////////////////
+ static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
+ {
+   uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
+   uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
+   if ( scidac_csuma !=scidac_checksuma) return 0;
+   if ( scidac_csumb !=scidac_checksumb) return 0;
+   return 1;
+ }
 
 ////////////////////////////////////////////////////////////////////////////////////
 // Lime, ILDG and Scidac I/O classes
 ////////////////////////////////////////////////////////////////////////////////////
 class GridLimeReader : public BinaryIO {
-public:
-  ///////////////////////////////////////////////////
-  // FIXME: format for RNG? Now just binary out instead
-  ///////////////////////////////////////////////////
+ public:
+   ///////////////////////////////////////////////////
+   // FIXME: format for RNG? Now just binary out instead
+   ///////////////////////////////////////////////////
 
-  FILE       *File;
-  LimeReader *LimeR;
-  std::string filename;
+   FILE       *File;
+   LimeReader *LimeR;
+   std::string filename;
 
-  /////////////////////////////////////////////
-  // Open the file
-  /////////////////////////////////////////////
-  void open(const std::string &_filename) 
-  {
-    filename= _filename;
-    File = fopen(filename.c_str(), "r");
-    LimeR = limeCreateReader(File);
-  }
-  /////////////////////////////////////////////
-  // Close the file
-  /////////////////////////////////////////////
-  void close(void){
-    fclose(File);
-    //     limeDestroyReader(LimeR);
-  }
+   /////////////////////////////////////////////
+   // Open the file
+   /////////////////////////////////////////////
+   void open(const std::string &_filename) 
+   {
+     filename= _filename;
+     File = fopen(filename.c_str(), "r");
+     if (File == nullptr)
+     {
+       std::cerr << "cannot open file '" << filename << "'" << std::endl;
+       abort();
+     }
+     LimeR = limeCreateReader(File);
+   }
+   /////////////////////////////////////////////
+   // Close the file
+   /////////////////////////////////////////////
+   void close(void){
+     fclose(File);
+     //     limeDestroyReader(LimeR);
+   }
 
   ////////////////////////////////////////////
   // Read a generic lattice field and verify checksum
@@ -227,7 +231,8 @@ public:
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
 	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-
+  std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
+  std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
 	/////////////////////////////////////////////
 	// Insist checksum is next record
 	/////////////////////////////////////////////
@@ -244,10 +249,8 @@ public:
   ////////////////////////////////////////////
   // Read a generic serialisable object
   ////////////////////////////////////////////
-  template<class serialisable_object>
-  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
+  void readLimeObject(std::string &xmlstring,std::string record_name)
   {
-    std::string xmlstring;
     // should this be a do while; can we miss a first record??
     while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 
@@ -261,115 +264,178 @@ public:
 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
 	//	std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
 
-	XmlReader RD(&xmlc[0],"");
-	read(RD,object_name,object);
+   xmlstring = std::string(&xmlc[0]);
 	return;
       }
 
     }  
     assert(0);
   }
+
+  template<class serialisable_object>
+  void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
+  {
+    std::string xmlstring;
+
+    readLimeObject(xmlstring, record_name);
+	  XmlReader RD(xmlstring, true, "");
+	  read(RD,object_name,object);
+  }
 };
 
-class GridLimeWriter : public BinaryIO {
-public:
-  ///////////////////////////////////////////////////
-  // FIXME: format for RNG? Now just binary out instead
-  // FIXME: collective calls or not ?
-  //      : must know if I am the I/O boss
-  ///////////////////////////////////////////////////
-  FILE       *File;
-  LimeWriter *LimeW;
-  std::string filename;
+class GridLimeWriter : public BinaryIO 
+{
+ public:
 
-  void open(const std::string &_filename) { 
-    filename= _filename;
-    File = fopen(filename.c_str(), "w");
-    LimeW = limeCreateWriter(File); assert(LimeW != NULL );
-  }
-  /////////////////////////////////////////////
-  // Close the file
-  /////////////////////////////////////////////
-  void close(void) {
-    fclose(File);
-    //  limeDestroyWriter(LimeW);
-  }
+   ///////////////////////////////////////////////////
+   // FIXME: format for RNG? Now just binary out instead
+   // FIXME: collective calls or not ?
+   //      : must know if I am the I/O boss
+   ///////////////////////////////////////////////////
+   FILE       *File;
+   LimeWriter *LimeW;
+   std::string filename;
+   bool        boss_node;
+   GridLimeWriter( bool isboss = true) : File(NULL), LimeW(NULL), boss_node(isboss) {
+     // File/LimeW are only assigned on the boss node in open(); start them null on every node.
+   }
+   void open(const std::string &_filename) { // boss node only: create the file and attach a LIME writer
+     filename= _filename;
+     if ( !boss_node ) return;
+     File = fopen(filename.c_str(), "w");
+     if ( File == nullptr ) { std::cerr << "cannot open file '" << filename << "'" << std::endl; abort(); } // fail loudly, as GridLimeReader::open does
+     LimeW = limeCreateWriter(File); assert(LimeW != NULL );
+   }
+   /////////////////////////////////////////////
+   // Close the file
+   /////////////////////////////////////////////
+   void close(void) {
+     if ( boss_node ) {
+       fclose(File);
+     }
+     //  limeDestroyWriter(LimeW);
+   }
   ///////////////////////////////////////////////////////
   // Lime utility functions
   ///////////////////////////////////////////////////////
   int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
   {
-    LimeRecordHeader *h;
-    h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
-    assert(limeWriteRecordHeader(h, LimeW) >= 0);
-    limeDestroyHeader(h);
+    if ( boss_node ) {
+      LimeRecordHeader *h;
+      h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
+      assert(limeWriteRecordHeader(h, LimeW) >= 0);
+      limeDestroyHeader(h);
+    }
     return LIME_SUCCESS;
   }
   ////////////////////////////////////////////
   // Write a generic serialisable object
   ////////////////////////////////////////////
-  template<class serialisable_object>
-  void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name)
+  void writeLimeObject(int MB,int ME,XmlWriter &writer,std::string object_name,std::string record_name)
   {
-    std::string xmlstring;
-    {
-      XmlWriter WR("","");
-      write(WR,object_name,object);
-      xmlstring = WR.XmlString();
-    }
-    //    std::cout << "WriteLimeObject" << record_name <<std::endl;
-    uint64_t nbytes = xmlstring.size();
-    //    std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
-    int err;
-    LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes); 
-    assert(h!= NULL);
+    if ( boss_node ) {
+      std::string xmlstring = writer.docString();
 
-    err=limeWriteRecordHeader(h, LimeW);                    assert(err>=0);
-    err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
-    err=limeWriterCloseRecord(LimeW);                       assert(err>=0);
-    limeDestroyHeader(h);
-    //    std::cout << " File offset is now"<<ftello(File) << std::endl;
+      //    std::cout << "WriteLimeObject" << record_name <<std::endl;
+      uint64_t nbytes = xmlstring.size();
+      //    std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
+      int err;
+      LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes); 
+      assert(h!= NULL);
+      
+      err=limeWriteRecordHeader(h, LimeW);                    assert(err>=0);
+      err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
+      err=limeWriterCloseRecord(LimeW);                       assert(err>=0);
+      limeDestroyHeader(h);
+    }
   }
-  ////////////////////////////////////////////
+
+  template<class serialisable_object>
+  void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, const unsigned int scientificPrec = 0)
+  {
+    XmlWriter WR("","");
+
+    if (scientificPrec)
+    {
+      WR.scientificFormat(true);
+      WR.setPrecision(scientificPrec);
+    }
+    write(WR,object_name,object);
+    writeLimeObject(MB, ME, WR, object_name, record_name);
+  }
+  ////////////////////////////////////////////////////
   // Write a generic lattice field and csum
-  ////////////////////////////////////////////
+  // This routine is Collectively called by all nodes
+  // in communicator used by the field.Grid()
+  ////////////////////////////////////////////////////
   template<class vobj>
   void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
   {
-    ////////////////////////////////////////////
-    // Create record header
-    ////////////////////////////////////////////
-    typedef typename vobj::scalar_object sobj;
-    int err;
-    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    uint64_t PayloadSize = sizeof(sobj) * field.Grid()->_gsites;
-    createLimeRecordHeader(record_name, 0, 0, PayloadSize);
-
-    //    std::cout << "W sizeof(sobj)"      <<sizeof(sobj)<<std::endl;
-    //    std::cout << "W Gsites "           <<field.Grid()->_gsites<<std::endl;
-    //    std::cout << "W Payload expected " <<PayloadSize<<std::endl;
-
     ////////////////////////////////////////////////////////////////////
     // NB: FILE and iostream are jointly writing disjoint sequences in the
     // the same file through different file handles (integer units).
     // 
     // These are both buffered, so why I think this code is right is as follows.
     //
-    // i)  write record header to FILE *File, telegraphing the size. 
-    // ii) ftello reads the offset from FILE *File .
+    // i)  write record header to FILE *File, telegraphing the size; flush
+    // ii) ftello reads the offset from FILE *File . 
     // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
     //      Closes iostream and flushes.
     // iv) fseek on FILE * to end of this disjoint section.
     //  v) Continue writing scidac record.
     ////////////////////////////////////////////////////////////////////
-    uint64_t offset = ftello(File);
-    //    std::cout << " Writing to offset "<<offset << std::endl;
+    
+    GridBase *grid = field.Grid();
+    assert(boss_node == field.Grid()->IsBoss() );
+
+    ////////////////////////////////////////////
+    // Create record header
+    ////////////////////////////////////////////
+    typedef typename vobj::scalar_object sobj;
+    int err;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
+    if ( boss_node ) {
+      createLimeRecordHeader(record_name, 0, 0, PayloadSize);
+      fflush(File);
+    }
+    
+    //    std::cout << "W sizeof(sobj)"      <<sizeof(sobj)<<std::endl;
+    //    std::cout << "W Gsites "           <<field.Grid()->_gsites<<std::endl;
+    //    std::cout << "W Payload expected " <<PayloadSize<<std::endl;
+
+    ////////////////////////////////////////////////
+    // Check all nodes agree on file position
+    ////////////////////////////////////////////////
+    uint64_t offset1;
+    if ( boss_node ) {
+      offset1 = ftello(File);    
+    }
+    grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
+
+    ///////////////////////////////////////////
+    // The above is collective. Write by other means into the binary record
+    ///////////////////////////////////////////
     std::string format = getFormatString<vobj>();
     BinarySimpleMunger<sobj,sobj> munge;
-    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-    //    fseek(File,0,SEEK_END);    offset = ftello(File);std::cout << " offset now "<<offset << std::endl;
-    err=limeWriterCloseRecord(LimeW);  assert(err>=0);
+    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
 
+    ///////////////////////////////////////////////////////////////////
+    // Wind forward; check the file grew by exactly PayloadSize bytes
+    ///////////////////////////////////////////////////////////////////
+    if ( boss_node ) {
+      fseek(File,0,SEEK_END);             
+      uint64_t offset2 = ftello(File);     //    std::cout << " now at offset "<<offset2 << std::endl;
+      assert( (offset2-offset1) == PayloadSize);
+    }
+
+    /////////////////////////////////////////////////////////////
+    // Close the binary record (boss node only)
+    /////////////////////////////////////////////////////////////
+
+    if ( boss_node ) { 
+      err=limeWriterCloseRecord(LimeW);  assert(err>=0);
+    }
     ////////////////////////////////////////
     // Write checksum element, propagaing forward from the BinaryIO
     // Always pair a checksum with a binary object, and close message
@@ -379,26 +445,32 @@ public:
     std::stringstream streamb; streamb << std::hex << scidac_csumb;
     checksum.suma= streama.str();
     checksum.sumb= streamb.str();
-    //    std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
-    writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
+    if ( boss_node ) { 
+      writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
+    }
   }
 };
 
 class ScidacWriter : public GridLimeWriter {
-public:
+ public:
+
+  ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss)  { };
 
   template<class SerialisableUserFile>
   void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
   {
     scidacFile    _scidacFile(grid);
-    writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
-    writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
+    if ( this->boss_node ) {
+      writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
+      writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
+    }
   }
   ////////////////////////////////////////////////
   // Write generic lattice field in scidac format
   ////////////////////////////////////////////////
   template <class vobj, class userRecord>
-  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord) 
+  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
+                              const unsigned int recordScientificPrec = 0) 
   {
     GridBase * grid = field.Grid();
 
@@ -414,24 +486,27 @@ public:
     //////////////////////////////////////////////
     // Fill the Lime file record by record
     //////////////////////////////////////////////
-    writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
-    writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
-    writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
+    if ( this->boss_node ) {
+      writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
+      writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML), recordScientificPrec);
+      writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
+    }
+    // Collective call
     writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
   }
 };
 
 
 class ScidacReader : public GridLimeReader {
-public:
+ public:
 
-  template<class SerialisableUserFile>
-  void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
-  {
-    scidacFile    _scidacFile(grid);
-    readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
-    readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
-  }
+   template<class SerialisableUserFile>
+   void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
+   {
+     scidacFile    _scidacFile(grid);
+     readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
+     readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
+   }
   ////////////////////////////////////////////////
   // Write generic lattice field in scidac format
   ////////////////////////////////////////////////
@@ -482,7 +557,9 @@ public:
 
 
 class IldgWriter : public ScidacWriter {
-public:
+ public:
+  
+  IldgWriter(bool isboss) : ScidacWriter(isboss) {};
 
   ///////////////////////////////////
   // A little helper
@@ -525,7 +602,7 @@ public:
     header.ildg_lfn = LFN;
 
     assert ( (format == std::string("IEEE32BIG"))  
-	     ||(format == std::string("IEEE64BIG")) );
+           ||(format == std::string("IEEE64BIG")) );
 
     //////////////////////////////////////////////////////
     // Fill ILDG header data struct
@@ -567,12 +644,11 @@ public:
     writeLimeIldgLFN(header.ildg_lfn);                                                 // rec
     writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
     //    limeDestroyWriter(LimeW);
-    fclose(File);
   }
 };
 
 class IldgReader : public GridLimeReader {
-public:
+ public:
 
   ////////////////////////////////////////////////////////////////
   // Read either Grid/SciDAC/ILDG configuration
@@ -593,7 +669,7 @@ public:
 
     GridBase *grid = Umu.Grid();
 
-    auto dims = Umu.Grid()->FullDimensions();
+    Coordinate dims = Umu.Grid()->FullDimensions();
 
     assert(dims.size()==4);
 
@@ -643,9 +719,11 @@ public:
 
 	//////////////////////////////////
 	// ILDG format record
+
+  std::string xmlstring(&xmlc[0]);
 	if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) { 
 
-	  XmlReader RD(&xmlc[0],"");
+	  XmlReader RD(xmlstring, true, "");
 	  read(RD,"ildgFormat",ildgFormat_);
 
 	  if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
@@ -660,13 +738,13 @@ public:
 	}
 
 	if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
-	  FieldMetaData_.ildg_lfn = std::string(&xmlc[0]);
+	  FieldMetaData_.ildg_lfn = xmlstring;
 	  found_ildgLFN = 1;
 	}
 
 	if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) { 
 
-	  XmlReader RD(&xmlc[0],"");
+	  XmlReader RD(xmlstring, true, "");
 	  read(RD,"FieldMetaData",FieldMetaData_);
 
 	  format = FieldMetaData_.floating_point;
@@ -680,18 +758,17 @@ public:
 	}
 
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) { 
-	  std::string xmls(&xmlc[0]);
 	  // is it a USQCD info field
-	  if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) { 
+	  if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) { 
 	    //	    std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
-	    XmlReader RD(&xmlc[0],"");
+	    XmlReader RD(xmlstring, true, "");
 	    read(RD,"usqcdInfo",usqcdInfo_);
 	    found_usqcdInfo = 1;
 	  }
 	}
 
 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) { 
-	  XmlReader RD(&xmlc[0],"");
+	  XmlReader RD(xmlstring, true, "");
 	  read(RD,"scidacChecksum",scidacChecksum_);
 	  found_scidacChecksum = 1;
 	}
@@ -722,7 +799,6 @@ public:
     assert(found_ildgBinary);
     assert(found_ildgFormat);
     assert(found_scidacChecksum);
-    assert(found_ildgLFN==0);
 
     // Must find something with the lattice dimensions
     assert(found_FieldMetaData||found_ildgFormat);
@@ -788,11 +864,11 @@ public:
       std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
     }
   }
-};
+ };
 
 NAMESPACE_END(Grid);
 
+
 //HAVE_LIME
 #endif
 
-#endif
diff --git a/lib/parallelIO/IldgIOtypes.h b/Grid/parallelIO/IldgIOtypes.h
similarity index 98%
rename from lib/parallelIO/IldgIOtypes.h
rename to Grid/parallelIO/IldgIOtypes.h
index 87b50f6b..ddc0969c 100644
--- a/lib/parallelIO/IldgIOtypes.h
+++ b/Grid/parallelIO/IldgIOtypes.h
@@ -136,8 +136,9 @@ public:
 				  int, typesize,
 				  int, datacount);
 
-  scidacRecord() { version =1.0; }
-
+  scidacRecord()
+  : version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0)
+  {}
 };
 
 ////////////////////////
diff --git a/lib/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h
similarity index 95%
rename from lib/parallelIO/MetaData.h
rename to Grid/parallelIO/MetaData.h
index 84296108..88a600e5 100644
--- a/lib/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -81,22 +81,19 @@ public:
 				  std::string, creation_date,
 				  std::string, archive_date,
 				  std::string, floating_point);
-  FieldMetaData(void) { 
-    nd=4;
-    dimension.resize(4);
-    boundary.resize(4);
-    scidac_checksuma=0;
-    scidac_checksumb=0;
-    checksum=0;
-  }
+      // WARNING: non-initialised values might lead to twisted parallel IO
+      // issues, std::string are fine because they initialise to size 0
+      // as per C++ standard.
+      FieldMetaData(void) 
+      : nd(4), dimension(4,0), boundary(4, ""), data_start(0),
+      link_trace(0.), plaquette(0.), checksum(0),
+      scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
+      {}
 };
 
-
-
-
-
-using namespace Grid;
-
+// PB disable using namespace - this is a header and forces namespace visibility for all
+// including files
+//using namespace Grid;
 
 //////////////////////////////////////////////////////////////////////
 // Bit and Physical Checksumming and QA of data
diff --git a/lib/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h
similarity index 96%
rename from lib/parallelIO/NerscIO.h
rename to Grid/parallelIO/NerscIO.h
index 503d53f8..7f6adaa5 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -56,6 +56,7 @@ public:
   // for the header-reader
   static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field)
   {
+    uint64_t offset=0;
     std::map<std::string,std::string> header;
     std::string line;
 
@@ -137,7 +138,7 @@ public:
     typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
 
     GridBase *grid = Umu.Grid();
-    int offset = readHeader(file,grid,header);
+    uint64_t offset = readHeader(file,Umu.Grid(),header);
 
     FieldMetaData clone(header);
 
@@ -232,21 +233,25 @@ public:
     GaugeStatistics(Umu,header);
     MachineCharacteristics(header);
 
-    int offset;
-  
-    truncate(file);
+	uint64_t offset;
 
     // Sod it -- always write 3x3 double
     header.floating_point = std::string("IEEE64BIG");
     header.data_type      = std::string("4D_SU3_GAUGE_3x3");
     GaugeSimpleUnmunger<fobj3D,sobj> munge;
+	if ( grid->IsBoss() ) { 
+	  truncate(file);
     offset = writeHeader(header,file);
+	}
+	grid->Broadcast(0,(void *)&offset,sizeof(offset));
 
     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
     BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
 					      nersc_csum,scidac_csuma,scidac_csumb);
     header.checksum = nersc_csum;
+	if ( grid->IsBoss() ) { 
     writeHeader(header,file);
+	}
 
     std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
 	     <<std::hex<<header.checksum
@@ -274,7 +279,7 @@ public:
     header.plaquette=0.0;
     MachineCharacteristics(header);
 
-    int offset;
+	uint64_t offset;
   
 #ifdef RNG_RANLUX
     header.floating_point = std::string("UINT64");
@@ -289,12 +294,18 @@ public:
     header.data_type      = std::string("SITMO");
 #endif
 
+	if ( grid->IsBoss() ) { 
     truncate(file);
     offset = writeHeader(header,file);
+	}
+	grid->Broadcast(0,(void *)&offset,sizeof(offset));
+	
     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
     BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
     header.checksum = nersc_csum;
+	if ( grid->IsBoss() ) { 
     offset = writeHeader(header,file);
+	}
 
     std::cout<<GridLogMessage 
 	     <<"Written NERSC RNG STATE "<<file<< " checksum "
@@ -309,7 +320,7 @@ public:
 
     GridBase *grid = parallel.Grid();
 
-    int offset = readHeader(file,grid,header);
+	uint64_t offset = readHeader(file,grid,header);
 
     FieldMetaData clone(header);
 
diff --git a/lib/perfmon/PerfCount.cc b/Grid/perfmon/PerfCount.cc
similarity index 100%
rename from lib/perfmon/PerfCount.cc
rename to Grid/perfmon/PerfCount.cc
diff --git a/lib/perfmon/PerfCount.h b/Grid/perfmon/PerfCount.h
similarity index 100%
rename from lib/perfmon/PerfCount.h
rename to Grid/perfmon/PerfCount.h
diff --git a/lib/perfmon/Stat.cc b/Grid/perfmon/Stat.cc
similarity index 100%
rename from lib/perfmon/Stat.cc
rename to Grid/perfmon/Stat.cc
diff --git a/lib/perfmon/Stat.h b/Grid/perfmon/Stat.h
similarity index 100%
rename from lib/perfmon/Stat.h
rename to Grid/perfmon/Stat.h
diff --git a/lib/perfmon/Timer.h b/Grid/perfmon/Timer.h
similarity index 72%
rename from lib/perfmon/Timer.h
rename to Grid/perfmon/Timer.h
index 5b23fcd4..88b4e1cc 100644
--- a/lib/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@@ -47,15 +47,39 @@ inline double usecond(void) {
 
 typedef  std::chrono::system_clock          GridClock;
 typedef  std::chrono::time_point<GridClock> GridTimePoint;
-typedef  std::chrono::milliseconds          GridTime;
-typedef  std::chrono::microseconds          GridUsecs;
 
-inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
+typedef  std::chrono::seconds               GridSecs;
+typedef  std::chrono::milliseconds          GridMillisecs;
+typedef  std::chrono::microseconds          GridUsecs;
+typedef  std::chrono::microseconds          GridTime;
+
+inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
 {
-  stream << time.count()<<" ms";
+  stream << time.count()<<" s";
+  return stream;
+}
+inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
+{
+  GridSecs second(1);
+  auto     secs       = now/second ; 
+  auto     subseconds = now%second ;
+  auto     fill       = stream.fill();
+  stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
+  stream.fill(fill);
+  return stream;
+}
+inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
+{
+  GridSecs second(1);
+  auto     seconds    = now/second ; 
+  auto     subseconds = now%second ;
+  auto     fill       = stream.fill();
+  stream << seconds<<"."<<std::setw(6)<<std::setfill('0')<<subseconds.count()<<" s";
+  stream.fill(fill);
   return stream;
 }
  
+
 class GridStopWatch {
 private:
   bool running;
@@ -94,6 +118,9 @@ public:
     assert(running == false);
     return (uint64_t) accumulator.count();
   }
+  bool isRunning(void){
+    return running;
+  }
 };
 
 NAMESPACE_END(Grid)
diff --git a/lib/pugixml/pugiconfig.hpp b/Grid/pugixml/pugiconfig.hpp
similarity index 92%
rename from lib/pugixml/pugiconfig.hpp
rename to Grid/pugixml/pugiconfig.hpp
index 5ee5131f..f739e062 100644
--- a/lib/pugixml/pugiconfig.hpp
+++ b/Grid/pugixml/pugiconfig.hpp
@@ -1,7 +1,7 @@
 /**
- * pugixml parser - version 1.6
+ * pugixml parser - version 1.9
  * --------------------------------------------------------
- * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at http://pugixml.org/
  *
  * This library is distributed under the MIT License. See notice at the end
@@ -17,6 +17,9 @@
 // Uncomment this to enable wchar_t mode
 // #define PUGIXML_WCHAR_MODE
 
+// Uncomment this to enable compact mode
+// #define PUGIXML_COMPACT
+
 // Uncomment this to disable XPath
 // #define PUGIXML_NO_XPATH
 
@@ -46,7 +49,7 @@
 #endif
 
 /**
- * Copyright (c) 2006-2015 Arseny Kapoulkine
+ * Copyright (c) 2006-2018 Arseny Kapoulkine
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
@@ -59,7 +62,7 @@
  *
  * The above copyright notice and this permission notice shall be
  * included in all copies or substantial portions of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
diff --git a/lib/pugixml/pugixml.cc b/Grid/pugixml/pugixml.cc
similarity index 92%
rename from lib/pugixml/pugixml.cc
rename to Grid/pugixml/pugixml.cc
index 1f83e38f..dd08092c 100644
--- a/lib/pugixml/pugixml.cc
+++ b/Grid/pugixml/pugixml.cc
@@ -1,7 +1,7 @@
 /**
- * pugixml parser - version 1.8
+ * pugixml parser - version 1.9
  * --------------------------------------------------------
- * Copyright (C) 2006-2016, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at http://pugixml.org/
  *
  * This library is distributed under the MIT License. See notice at the end
@@ -14,7 +14,7 @@
 #ifndef SOURCE_PUGIXML_CPP
 #define SOURCE_PUGIXML_CPP
 
-#include "pugixml.h"
+#include <Grid/pugixml/pugixml.h>
 
 #include <stdlib.h>
 #include <stdio.h>
@@ -29,9 +29,6 @@
 #ifndef PUGIXML_NO_XPATH
 #	include <math.h>
 #	include <float.h>
-#	ifdef PUGIXML_NO_EXCEPTIONS
-#		include <setjmp.h>
-#	endif
 #endif
 
 #ifndef PUGIXML_NO_STL
@@ -47,10 +44,13 @@
 #	pragma warning(push)
 #	pragma warning(disable: 4127) // conditional expression is constant
 #	pragma warning(disable: 4324) // structure was padded due to __declspec(align())
-#	pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable
 #	pragma warning(disable: 4702) // unreachable code
 #	pragma warning(disable: 4996) // this function or variable may be unsafe
-#	pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged
+#endif
+
+#if defined(_MSC_VER) && defined(__c2__)
+#	pragma clang diagnostic push
+#	pragma clang diagnostic ignored "-Wdeprecated" // this function or variable may be unsafe
 #endif
 
 #ifdef __INTEL_COMPILER
@@ -60,11 +60,6 @@
 #	pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
 #endif
 
-#ifdef __NVCC__
-#	pragma warning(disable: 177) // function was declared but never referenced
-#       pragma diag_suppress declared_but_not_referenced
-#endif
-
 #if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY)
 #	pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away
 #endif
@@ -81,6 +76,10 @@
 #	pragma diag_suppress=237 // controlling expression is constant
 #endif
 
+#ifdef __TI_COMPILER_VERSION__
+#	pragma diag_suppress 179 // function was declared but never referenced
+#endif
+
 // Inlining controls
 #if defined(_MSC_VER) && _MSC_VER >= 1300
 #	define PUGI__NO_INLINE __declspec(noinline)
@@ -91,7 +90,7 @@
 #endif
 
 // Branch weight controls
-#if defined(__GNUC__)
+#if defined(__GNUC__) && !defined(__c2__)
 #	define PUGI__UNLIKELY(cond) __builtin_expect(cond, 0)
 #else
 #	define PUGI__UNLIKELY(cond) (cond)
@@ -107,6 +106,17 @@
 #	define PUGI__DMC_VOLATILE
 #endif
 
+// Integer sanitizer workaround; we only apply this for clang since gcc8 has no_sanitize but not unsigned-integer-overflow and produces "attribute directive ignored" warnings
+#if defined(__clang__) && defined(__has_attribute)
+#	if __has_attribute(no_sanitize)
+#		define PUGI__UNSIGNED_OVERFLOW __attribute__((no_sanitize("unsigned-integer-overflow")))
+#	else
+#		define PUGI__UNSIGNED_OVERFLOW
+#	endif
+#else
+#	define PUGI__UNSIGNED_OVERFLOW
+#endif
+
 // Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all)
 #if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST)
 using std::memcpy;
@@ -114,11 +124,11 @@ using std::memmove;
 using std::memset;
 #endif
 
-// Some MinGW versions have headers that erroneously omit LLONG_MIN/LLONG_MAX/ULLONG_MAX definitions in strict ANSI mode
-#if defined(PUGIXML_HAS_LONG_LONG) && defined(__MINGW32__) && defined(__STRICT_ANSI__) && !defined(LLONG_MAX) && !defined(LLONG_MIN) && !defined(ULLONG_MAX)
-#	define LLONG_MAX 9223372036854775807LL
-#	define LLONG_MIN (-LLONG_MAX-1)
-#	define ULLONG_MAX (2ULL*LLONG_MAX+1)
+// Some MinGW/GCC versions have headers that erroneously omit LLONG_MIN/LLONG_MAX/ULLONG_MAX definitions from limits.h in some configurations
+#if defined(PUGIXML_HAS_LONG_LONG) && defined(__GNUC__) && !defined(LLONG_MAX) && !defined(LLONG_MIN) && !defined(ULLONG_MAX)
+#	define LLONG_MIN (-LLONG_MAX - 1LL)
+#	define LLONG_MAX __LONG_LONG_MAX__
+#	define ULLONG_MAX (LLONG_MAX * 2ULL + 1ULL)
 #endif
 
 // In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features
@@ -126,6 +136,16 @@ using std::memset;
 #	define PUGI__MSVC_CRT_VERSION _MSC_VER
 #endif
 
+// Not all platforms have snprintf; we define a wrapper that uses snprintf if possible. This only works with buffers with a known size.
+#if __cplusplus >= 201103
+#	define PUGI__SNPRINTF(buf, ...) snprintf(buf, sizeof(buf), __VA_ARGS__)
+#elif defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400
+#	define PUGI__SNPRINTF(buf, ...) _snprintf_s(buf, _countof(buf), _TRUNCATE, __VA_ARGS__)
+#else
+#	define PUGI__SNPRINTF sprintf
+#endif
+
+// We put implementation details into an anonymous namespace in source mode, but have to keep it in non-anonymous namespace in header-only mode to prevent binary bloat.
 #ifdef PUGIXML_HEADER_ONLY
 #	define PUGI__NS_BEGIN namespace pugi { namespace impl {
 #	define PUGI__NS_END } }
@@ -174,19 +194,15 @@ PUGI__NS_BEGIN
 	template <typename T>
 	struct xml_memory_management_function_storage
 	{
-	  static allocation_function allocate    ;
-	  static deallocation_function deallocate;
+		static allocation_function allocate;
+		static deallocation_function deallocate;
 	};
 
 	// Global allocation functions are stored in class statics so that in header mode linker deduplicates them
 	// Without a template<> we'll get multiple definitions of the same static
-        template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
+	template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
 	template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
 
-//        allocation_function xml_memory_management_function_storage<int>::allocate = default_allocate;
-//        deallocation_function xml_memory_management_function_storage<int>::deallocate = default_deallocate;
-
-        template struct xml_memory_management_function_storage<int>;
 	typedef xml_memory_management_function_storage<int> xml_memory;
 PUGI__NS_END
 
@@ -288,67 +304,37 @@ PUGI__NS_BEGIN
 			}
 		}
 
-		void** find(const void* key)
+		void* find(const void* key)
 		{
-			assert(key);
-
 			if (_capacity == 0) return 0;
 
-			size_t hashmod = _capacity - 1;
-			size_t bucket = hash(key) & hashmod;
+			item_t* item = get_item(key);
+			assert(item);
+			assert(item->key == key || (item->key == 0 && item->value == 0));
 
-			for (size_t probe = 0; probe <= hashmod; ++probe)
-			{
-				item_t& probe_item = _items[bucket];
-
-				if (probe_item.key == key)
-					return &probe_item.value;
-
-				if (probe_item.key == 0)
-					return 0;
-
-				// hash collision, quadratic probing
-				bucket = (bucket + probe + 1) & hashmod;
-			}
-
-			assert(false && "Hash table is full");
-			return 0;
+			return item->value;
 		}
 
-		void** insert(const void* key)
+		void insert(const void* key, void* value)
 		{
-			assert(key);
 			assert(_capacity != 0 && _count < _capacity - _capacity / 4);
 
-			size_t hashmod = _capacity - 1;
-			size_t bucket = hash(key) & hashmod;
+			item_t* item = get_item(key);
+			assert(item);
 
-			for (size_t probe = 0; probe <= hashmod; ++probe)
+			if (item->key == 0)
 			{
-				item_t& probe_item = _items[bucket];
-
-				if (probe_item.key == 0)
-				{
-					probe_item.key = key;
-					_count++;
-					return &probe_item.value;
-				}
-
-				if (probe_item.key == key)
-					return &probe_item.value;
-
-				// hash collision, quadratic probing
-				bucket = (bucket + probe + 1) & hashmod;
+				_count++;
+				item->key = key;
 			}
 
-			assert(false && "Hash table is full");
-			return 0;
+			item->value = value;
 		}
 
-		bool reserve()
+		bool reserve(size_t extra = 16)
 		{
-			if (_count + 16 >= _capacity - _capacity / 4)
-				return rehash();
+			if (_count + extra >= _capacity - _capacity / 4)
+				return rehash(_count + extra);
 
 			return true;
 		}
@@ -365,9 +351,32 @@ PUGI__NS_BEGIN
 
 		size_t _count;
 
-		bool rehash();
+		bool rehash(size_t count);
 
-		static unsigned int hash(const void* key)
+		item_t* get_item(const void* key)
+		{
+			assert(key);
+			assert(_capacity > 0);
+
+			size_t hashmod = _capacity - 1;
+			size_t bucket = hash(key) & hashmod;
+
+			for (size_t probe = 0; probe <= hashmod; ++probe)
+			{
+				item_t& probe_item = _items[bucket];
+
+				if (probe_item.key == key || probe_item.key == 0)
+					return &probe_item;
+
+				// hash collision, quadratic probing
+				bucket = (bucket + probe + 1) & hashmod;
+			}
+
+			assert(false && "Hash table is full"); // unreachable
+			return 0;
+		}
+
+		static PUGI__UNSIGNED_OVERFLOW unsigned int hash(const void* key)
 		{
 			unsigned int h = static_cast<unsigned int>(reinterpret_cast<uintptr_t>(key));
 
@@ -382,25 +391,29 @@ PUGI__NS_BEGIN
 		}
 	};
 
-	PUGI__FN_NO_INLINE bool compact_hash_table::rehash()
+	PUGI__FN_NO_INLINE bool compact_hash_table::rehash(size_t count)
 	{
+		size_t capacity = 32;
+		while (count >= capacity - capacity / 4)
+			capacity *= 2;
+
 		compact_hash_table rt;
-		rt._capacity = (_capacity == 0) ? 32 : _capacity * 2;
-		rt._items = static_cast<item_t*>(xml_memory::allocate(sizeof(item_t) * rt._capacity));
+		rt._capacity = capacity;
+		rt._items = static_cast<item_t*>(xml_memory::allocate(sizeof(item_t) * capacity));
 
 		if (!rt._items)
 			return false;
 
-		memset(rt._items, 0, sizeof(item_t) * rt._capacity);
+		memset(rt._items, 0, sizeof(item_t) * capacity);
 
 		for (size_t i = 0; i < _capacity; ++i)
 			if (_items[i].key)
-				*rt.insert(_items[i].key) = _items[i].value;
+				rt.insert(_items[i].key, _items[i].value);
 
 		if (_items)
 			xml_memory::deallocate(_items);
 
-		_capacity = rt._capacity;
+		_capacity = capacity;
 		_items = rt._items;
 
 		assert(_count == rt._count);
@@ -787,12 +800,12 @@ PUGI__NS_BEGIN
 
 	template <int header_offset, typename T> PUGI__FN_NO_INLINE T* compact_get_value(const void* object)
 	{
-		return static_cast<T*>(*compact_get_page(object, header_offset)->allocator->_hash->find(object));
+		return static_cast<T*>(compact_get_page(object, header_offset)->allocator->_hash->find(object));
 	}
 
 	template <int header_offset, typename T> PUGI__FN_NO_INLINE void compact_set_value(const void* object, T* value)
 	{
-		*compact_get_page(object, header_offset)->allocator->_hash->insert(object) = value;
+		compact_get_page(object, header_offset)->allocator->_hash->insert(object, value);
 	}
 
 	template <typename T, int header_offset, int start = -126> class compact_pointer
@@ -839,7 +852,7 @@ PUGI__NS_BEGIN
 				{
 					uintptr_t base = reinterpret_cast<uintptr_t>(this) & ~(compact_alignment - 1);
 
-					return reinterpret_cast<T*>(base + ((_data - 1 + start) << compact_alignment_log2));
+					return reinterpret_cast<T*>(base + (_data - 1 + start) * compact_alignment);
 				}
 				else
 					return compact_get_value<header_offset, T>(this);
@@ -917,7 +930,7 @@ PUGI__NS_BEGIN
 				{
 					uintptr_t base = reinterpret_cast<uintptr_t>(this) & ~(compact_alignment - 1);
 
-					return reinterpret_cast<T*>(base + ((_data - 1 - 65533) << compact_alignment_log2));
+					return reinterpret_cast<T*>(base + (_data - 1 - 65533) * compact_alignment);
 				}
 				else if (_data == 65534)
 					return static_cast<T*>(compact_get_page(this, header_offset)->compact_shared_parent);
@@ -2154,7 +2167,7 @@ PUGI__NS_BEGIN
 		if (encoding == encoding_latin1)
 			return convert_buffer_generic(out_buffer, out_length, contents, size, latin1_decoder());
 
-		assert(false && "Invalid encoding");
+		assert(false && "Invalid encoding"); // unreachable
 		return false;
 	}
 #else
@@ -2259,7 +2272,7 @@ PUGI__NS_BEGIN
 		if (encoding == encoding_latin1)
 			return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
 
-		assert(false && "Invalid encoding");
+		assert(false && "Invalid encoding"); // unreachable
 		return false;
 	}
 #endif
@@ -2472,7 +2485,7 @@ PUGI__NS_BEGIN
 
 					for (;;)
 					{
-						if (static_cast<unsigned int>(static_cast<unsigned int>(ch) - '0') <= 9)
+						if (static_cast<unsigned int>(ch - '0') <= 9)
 							ucsc = 10 * ucsc + (ch - '0');
 						else if (ch == ';')
 							break;
@@ -2706,7 +2719,7 @@ PUGI__NS_BEGIN
 		case 5: return strconv_pcdata_impl<opt_true, opt_false, opt_true>::parse;
 		case 6: return strconv_pcdata_impl<opt_true, opt_true, opt_false>::parse;
 		case 7: return strconv_pcdata_impl<opt_true, opt_true, opt_true>::parse;
-		default: assert(false); return 0; // should not get here
+		default: assert(false); return 0; // unreachable
 		}
 	}
 
@@ -2883,7 +2896,7 @@ PUGI__NS_BEGIN
 		case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
 		case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
 		case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
-		default: assert(false); return 0; // should not get here
+		default: assert(false); return 0; // unreachable
 		}
 	}
 
@@ -3632,7 +3645,7 @@ PUGI__NS_BEGIN
 		if (encoding == encoding_latin1)
 			return convert_buffer_output_generic(r_u8, data, length, wchar_decoder(), latin1_writer());
 
-		assert(false && "Invalid encoding");
+		assert(false && "Invalid encoding"); // unreachable
 		return 0;
 	}
 #else
@@ -3671,7 +3684,7 @@ PUGI__NS_BEGIN
 		if (encoding == encoding_latin1)
 			return convert_buffer_output_generic(r_u8, data, length, utf8_decoder(), latin1_writer());
 
-		assert(false && "Invalid encoding");
+		assert(false && "Invalid encoding"); // unreachable
 		return 0;
 	}
 #endif
@@ -4198,7 +4211,7 @@ PUGI__NS_BEGIN
 				break;
 
 			default:
-				assert(false && "Invalid node type");
+				assert(false && "Invalid node type"); // unreachable
 		}
 	}
 
@@ -4410,6 +4423,7 @@ PUGI__NS_BEGIN
 
 		while (sit && sit != sn)
 		{
+			// when a tree is copied into one of the descendants, we need to skip that subtree to avoid an infinite loop
 			if (sit != dn)
 			{
 				xml_node_struct* copy = append_new_node(dit, alloc, PUGI__NODETYPE(sit));
@@ -4460,7 +4474,7 @@ PUGI__NS_BEGIN
 	}
 
 	// get value with conversion functions
-	template <typename U> U string_to_integer(const char_t* value, U minneg, U maxpos)
+	template <typename U> PUGI__FN PUGI__UNSIGNED_OVERFLOW U string_to_integer(const char_t* value, U minv, U maxv)
 	{
 		U result = 0;
 		const char_t* s = value;
@@ -4530,14 +4544,21 @@ PUGI__NS_BEGIN
 		}
 
 		if (negative)
-			return (overflow || result > minneg) ? 0 - minneg : 0 - result;
+		{
+			// Workaround for crayc++ CC-3059: Expected no overflow in routine.
+		#ifdef _CRAYC
+			return (overflow || result > ~minv + 1) ? minv : ~result + 1;
+		#else
+			return (overflow || result > 0 - minv) ? minv : 0 - result;
+		#endif
+		}
 		else
-			return (overflow || result > maxpos) ? maxpos : result;
+			return (overflow || result > maxv) ? maxv : result;
 	}
 
 	PUGI__FN int get_value_int(const char_t* value)
 	{
-		return string_to_integer<unsigned int>(value, 0 - static_cast<unsigned int>(INT_MIN), INT_MAX);
+		return string_to_integer<unsigned int>(value, static_cast<unsigned int>(INT_MIN), INT_MAX);
 	}
 
 	PUGI__FN unsigned int get_value_uint(const char_t* value)
@@ -4575,7 +4596,7 @@ PUGI__NS_BEGIN
 #ifdef PUGIXML_HAS_LONG_LONG
 	PUGI__FN long long get_value_llong(const char_t* value)
 	{
-		return string_to_integer<unsigned long long>(value, 0 - static_cast<unsigned long long>(LLONG_MIN), LLONG_MAX);
+		return string_to_integer<unsigned long long>(value, static_cast<unsigned long long>(LLONG_MIN), LLONG_MAX);
 	}
 
 	PUGI__FN unsigned long long get_value_ullong(const char_t* value)
@@ -4584,7 +4605,7 @@ PUGI__NS_BEGIN
 	}
 #endif
 
-	template <typename U> PUGI__FN char_t* integer_to_string(char_t* begin, char_t* end, U value, bool negative)
+	template <typename U> PUGI__FN PUGI__UNSIGNED_OVERFLOW char_t* integer_to_string(char_t* begin, char_t* end, U value, bool negative)
 	{
 		char_t* result = end - 1;
 		U rest = negative ? 0 - value : value;
@@ -4635,7 +4656,7 @@ PUGI__NS_BEGIN
 	PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, float value)
 	{
 		char buf[128];
-		sprintf(buf, "%.9g", value);
+		PUGI__SNPRINTF(buf, "%.9g", value);
 
 		return set_value_ascii(dest, header, header_mask, buf);
 	}
@@ -4644,7 +4665,7 @@ PUGI__NS_BEGIN
 	PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, double value)
 	{
 		char buf[128];
-		sprintf(buf, "%.17g", value);
+		PUGI__SNPRINTF(buf, "%.17g", value);
 
 		return set_value_ascii(dest, header, header_mask, buf);
 	}
@@ -6056,11 +6077,17 @@ namespace pugi
 
 		// get extra buffer element (we'll store the document fragment buffer there so that we can deallocate it later)
 		impl::xml_memory_page* page = 0;
-		impl::xml_extra_buffer* extra = static_cast<impl::xml_extra_buffer*>(doc->allocate_memory(sizeof(impl::xml_extra_buffer), page));
+		impl::xml_extra_buffer* extra = static_cast<impl::xml_extra_buffer*>(doc->allocate_memory(sizeof(impl::xml_extra_buffer) + sizeof(void*), page));
 		(void)page;
 
 		if (!extra) return impl::make_parse_result(status_out_of_memory);
 
+	#ifdef PUGIXML_COMPACT
+		// align the memory block to a pointer boundary; this is required for compact mode where memory allocations are only 4b aligned
+		// note that this requires up to sizeof(void*)-1 additional memory, which the allocation above takes into account
+		extra = reinterpret_cast<impl::xml_extra_buffer*>((reinterpret_cast<uintptr_t>(extra) + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1));
+	#endif
+
 		// add extra buffer to the list
 		extra->buffer = 0;
 		extra->next = doc->extra_buffers;
@@ -6120,7 +6147,7 @@ namespace pugi
 			if (j != _root)
 				result[--offset] = delimiter;
 
-			if (j->name && *j->name)
+			if (j->name)
 			{
 				size_t length = impl::strlength(j->name);
 
@@ -6139,7 +6166,7 @@ namespace pugi
 	{
 		xml_node found = *this; // Current search context.
 
-		if (!_root || !path_ || !path_[0]) return found;
+		if (!_root || !path_[0]) return found;
 
 		if (path_[0] == delimiter)
 		{
@@ -6186,10 +6213,10 @@ namespace pugi
 	{
 		walker._depth = -1;
 
-		xml_node arg_begin = *this;
+		xml_node arg_begin(_root);
 		if (!walker.begin(arg_begin)) return false;
 
-		xml_node cur = first_child();
+		xml_node_struct* cur = _root ? _root->first_child + 0 : 0;
 
 		if (cur)
 		{
@@ -6197,36 +6224,35 @@ namespace pugi
 
 			do
 			{
-				xml_node arg_for_each = cur;
+				xml_node arg_for_each(cur);
 				if (!walker.for_each(arg_for_each))
 					return false;
 
-				if (cur.first_child())
+				if (cur->first_child)
 				{
 					++walker._depth;
-					cur = cur.first_child();
+					cur = cur->first_child;
 				}
-				else if (cur.next_sibling())
-					cur = cur.next_sibling();
+				else if (cur->next_sibling)
+					cur = cur->next_sibling;
 				else
 				{
-					// Borland C++ workaround
-					while (!cur.next_sibling() && cur != *this && !cur.parent().empty())
+					while (!cur->next_sibling && cur != _root && cur->parent)
 					{
 						--walker._depth;
-						cur = cur.parent();
+						cur = cur->parent;
 					}
 
-					if (cur != *this)
-						cur = cur.next_sibling();
+					if (cur != _root)
+						cur = cur->next_sibling;
 				}
 			}
-			while (cur && cur != *this);
+			while (cur && cur != _root);
 		}
 
 		assert(walker._depth == -1);
 
-		xml_node arg_end = *this;
+		xml_node arg_end(_root);
 		return walker.end(arg_end);
 	}
 
@@ -6293,6 +6319,7 @@ namespace pugi
 			return _root->value && (_root->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0 ? _root->value - doc.buffer : -1;
 
 		default:
+			assert(false && "Invalid node type"); // unreachable
 			return -1;
 		}
 	}
@@ -6817,6 +6844,25 @@ namespace pugi
 		_destroy();
 	}
 
+#ifdef PUGIXML_HAS_MOVE
+	PUGI__FN xml_document::xml_document(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT: _buffer(0)
+	{
+		_create(); // NOTE(review): presumably builds a fresh empty document; _move() asserts the destination has no children
+		_move(rhs); // take over rhs's pages, buffers and node tree
+	}
+
+	PUGI__FN xml_document& xml_document::operator=(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT
+	{
+		if (this == &rhs) return *this; // self-move: leave the document untouched
+
+		_destroy(); // release the current contents before taking over rhs
+		_create(); // same destroy/create/move sequence the move constructor relies on
+		_move(rhs);
+
+		return *this;
+	}
+#endif
+
 	PUGI__FN void xml_document::reset()
 	{
 		_destroy();
@@ -6836,7 +6882,8 @@ namespace pugi
 		assert(!_root);
 
 	#ifdef PUGIXML_COMPACT
-		const size_t page_offset = sizeof(uint32_t);
+		// space for page marker for the first page (uint32_t), rounded up to pointer size; assumes pointers are at least 32-bit
+		const size_t page_offset = sizeof(void*);
 	#else
 		const size_t page_offset = 0;
 	#endif
@@ -6912,6 +6959,113 @@ namespace pugi
 		_root = 0;
 	}
 
+#ifdef PUGIXML_HAS_MOVE
+	PUGI__FN void xml_document::_move(xml_document& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT
+	{
+		impl::xml_document_struct* doc = static_cast<impl::xml_document_struct*>(_root); // destination: this document's state
+		impl::xml_document_struct* other = static_cast<impl::xml_document_struct*>(rhs._root); // source: the document being moved from
+
+		// save first child pointer for later; this needs hash access
+		xml_node_struct* other_first_child = other->first_child;
+
+	#ifdef PUGIXML_COMPACT
+		// reserve space for the hash table up front; this is the only operation that can fail
+		// if it does, we have no choice but to throw (if we have exceptions)
+		if (other_first_child)
+		{
+			size_t other_children = 0;
+			for (xml_node_struct* node = other_first_child; node; node = node->next_sibling)
+				other_children++;
+
+			// in compact mode, each pointer assignment could result in a hash table request
+			// during move, we have to relocate document first_child and parents of all children
+			// normally there's just one child and its parent has a pointerless encoding but
+			// we assume the worst here
+			if (!other->_hash->reserve(other_children + 1))
+			{
+			#ifdef PUGIXML_NO_EXCEPTIONS
+				return; // silent failure: nothing has been moved yet, so both documents are left as they were
+			#else
+				throw std::bad_alloc();
+			#endif
+			}
+		}
+	#endif
+
+		// move allocation state
+		doc->_root = other->_root;
+		doc->_busy_size = other->_busy_size;
+
+		// move buffer state
+		doc->buffer = other->buffer;
+		doc->extra_buffers = other->extra_buffers;
+		_buffer = rhs._buffer;
+
+	#ifdef PUGIXML_COMPACT
+		// move compact hash; note that the hash table can have pointers to other but they will be "inactive", similarly to nodes removed with remove_child
+		doc->hash = other->hash;
+		doc->_hash = &doc->hash;
+
+		// make sure we don't access other hash up until the end when we reinitialize other document
+		other->_hash = 0;
+	#endif
+
+		// move page structure
+		impl::xml_memory_page* doc_page = PUGI__GETPAGE(doc);
+		assert(doc_page && !doc_page->prev && !doc_page->next); // destination must consist of exactly one page
+
+		impl::xml_memory_page* other_page = PUGI__GETPAGE(other);
+		assert(other_page && !other_page->prev);
+
+		// relink pages since root page is embedded into xml_document
+		if (impl::xml_memory_page* page = other_page->next)
+		{
+			assert(page->prev == other_page);
+
+			page->prev = doc_page;
+
+			doc_page->next = page;
+			other_page->next = 0; // detach the chain from the source's first page
+		}
+
+		// make sure pages point to the correct document state
+		for (impl::xml_memory_page* page = doc_page->next; page; page = page->next)
+		{
+			assert(page->allocator == other);
+
+			page->allocator = doc;
+
+		#ifdef PUGIXML_COMPACT
+			// this automatically migrates most children between documents and prevents ->parent assignment from allocating
+			if (page->compact_shared_parent == other)
+				page->compact_shared_parent = doc;
+		#endif
+		}
+
+		// move tree structure
+		assert(!doc->first_child); // destination must not already hold any nodes
+
+		doc->first_child = other_first_child;
+
+		for (xml_node_struct* node = other_first_child; node; node = node->next_sibling)
+		{
+		#ifdef PUGIXML_COMPACT
+			// most children will have migrated when we reassigned compact_shared_parent
+			assert(node->parent == other || node->parent == doc);
+
+			node->parent = doc;
+		#else
+			assert(node->parent == other);
+			node->parent = doc;
+		#endif
+		}
+
+		// reset other document
+		new (other) impl::xml_document_struct(PUGI__GETPAGE(other)); // placement-new re-initialises the source's document struct in place
+		rhs._buffer = 0; // the buffer pointer was copied into this document above
+	}
+#endif
+
 #ifndef PUGIXML_NO_STL
 	PUGI__FN xml_parse_result xml_document::load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options, xml_encoding encoding)
 	{
@@ -7223,134 +7377,76 @@ PUGI__NS_BEGIN
 		return write + 1;
 	}
 
-	template <typename I> void copy_backwards(I begin, I end, I target)
+	template <typename T, typename Pred> void insertion_sort(T* begin, T* end, const Pred& pred) // sorts [begin, end) in place, ordering elements by pred
 	{
-		while (begin != end) *--target = *--end;
-	}
+		if (begin == end) // empty range: return before the loop below forms begin + 1
+			return;
 
-	template <typename I, typename Pred, typename T> void insertion_sort(I begin, I end, const Pred& pred, T*)
-	{
-		assert(begin != end);
-
-		for (I it = begin + 1; it != end; ++it)
+		for (T* it = begin + 1; it != end; ++it)
 		{
 			T val = *it;
+			T* hole = it; // slot that val will finally be written to
 
-			if (pred(val, *begin))
+			// move hole backwards while val orders before the element to its left
+			while (hole > begin && pred(val, *(hole - 1)))
 			{
-				// move to front
-				copy_backwards(begin, it, it + 1);
-				*begin = val;
+				*hole = *(hole - 1);
+				hole--;
 			}
+
+			// fill hole with element
+			*hole = val;
+		}
+	}
+
+	template <typename I, typename Pred> I median3(I first, I middle, I last, const Pred& pred) // returns the iterator whose element is the median of *first, *middle and *last
+	{
+		if (pred(*middle, *first)) swap(middle, first); // swaps the by-value iterator parameters, not the elements they refer to
+		if (pred(*last, *middle)) swap(last, middle);
+		if (pred(*middle, *first)) swap(middle, first);
+
+		return middle; // after the three compare/swap steps, middle designates the median element
+	}
+
+	template <typename T, typename Pred> void partition3(T* begin, T* end, T pivot, const Pred& pred, T** out_eqbeg, T** out_eqend)
+	{
+		// invariant: array is split into 4 groups: = < ? > (each variable denotes the boundary between the groups)
+		T* eq = begin;
+		T* lt = begin;
+		T* gt = end;
+
+		while (lt < gt)
+		{
+			if (pred(*lt, pivot))
+				lt++;
+			else if (*lt == pivot)
+				swap(*eq++, *lt++);
 			else
-			{
-				I hole = it;
-
-				// move hole backwards
-				while (pred(val, *(hole - 1)))
-				{
-					*hole = *(hole - 1);
-					hole--;
-				}
-
-				// fill hole with element
-				*hole = val;
-			}
+				swap(*lt, *--gt);
 		}
-	}
 
-	// std variant for elements with ==
-	template <typename I, typename Pred> void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend)
-	{
-		I eqbeg = middle, eqend = middle + 1;
+		// we now have just 3 groups: = < >; move equal elements to the middle
+		T* eqbeg = gt;
 
-		// expand equal range
-		while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg;
-		while (eqend != end && *eqend == *eqbeg) ++eqend;
+		for (T* it = begin; it != eq; ++it)
+			swap(*it, *--eqbeg);
 
-		// process outer elements
-		I ltend = eqbeg, gtbeg = eqend;
-
-		for (;;)
-		{
-			// find the element from the right side that belongs to the left one
-			for (; gtbeg != end; ++gtbeg)
-				if (!pred(*eqbeg, *gtbeg))
-				{
-					if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++);
-					else break;
-				}
-
-			// find the element from the left side that belongs to the right one
-			for (; ltend != begin; --ltend)
-				if (!pred(*(ltend - 1), *eqbeg))
-				{
-					if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg);
-					else break;
-				}
-
-			// scanned all elements
-			if (gtbeg == end && ltend == begin)
-			{
-				*out_eqbeg = eqbeg;
-				*out_eqend = eqend;
-				return;
-			}
-
-			// make room for elements by moving equal area
-			if (gtbeg == end)
-			{
-				if (--ltend != --eqbeg) swap(*ltend, *eqbeg);
-				swap(*eqbeg, *--eqend);
-			}
-			else if (ltend == begin)
-			{
-				if (eqend != gtbeg) swap(*eqbeg, *eqend);
-				++eqend;
-				swap(*gtbeg++, *eqbeg++);
-			}
-			else swap(*gtbeg++, *--ltend);
-		}
-	}
-
-	template <typename I, typename Pred> void median3(I first, I middle, I last, const Pred& pred)
-	{
-		if (pred(*middle, *first)) swap(*middle, *first);
-		if (pred(*last, *middle)) swap(*last, *middle);
-		if (pred(*middle, *first)) swap(*middle, *first);
-	}
-
-	template <typename I, typename Pred> void median(I first, I middle, I last, const Pred& pred)
-	{
-		if (last - first <= 40)
-		{
-			// median of three for small chunks
-			median3(first, middle, last, pred);
-		}
-		else
-		{
-			// median of nine
-			size_t step = (last - first + 1) / 8;
-
-			median3(first, first + step, first + 2 * step, pred);
-			median3(middle - step, middle, middle + step, pred);
-			median3(last - 2 * step, last - step, last, pred);
-			median3(first + step, middle, last - step, pred);
-		}
+		*out_eqbeg = eqbeg;
+		*out_eqend = gt;
 	}
 
 	template <typename I, typename Pred> void sort(I begin, I end, const Pred& pred)
 	{
 		// sort large chunks
-		while (end - begin > 32)
+		while (end - begin > 16)
 		{
 			// find median element
 			I middle = begin + (end - begin) / 2;
-			median(begin, middle, end - 1, pred);
+			I median = median3(begin, middle, end - 1, pred);
 
 			// partition in three chunks (< = >)
 			I eqbeg, eqend;
-			partition(begin, middle, end, pred, &eqbeg, &eqend);
+			partition3(begin, end, *median, pred, &eqbeg, &eqend);
 
 			// loop on larger half
 			if (eqbeg - begin > end - eqend)
@@ -7366,7 +7462,7 @@ PUGI__NS_BEGIN
 		}
 
 		// insertion sort small chunk
-		if (begin != end) insertion_sort(begin, end, pred, &*begin);
+		insertion_sort(begin, end, pred);
 	}
 PUGI__NS_END
 
@@ -7394,24 +7490,17 @@ PUGI__NS_BEGIN
 		};
 	};
 
-	class xpath_allocator
+	struct xpath_allocator
 	{
 		xpath_memory_block* _root;
 		size_t _root_size;
+		bool* _error;
 
-	public:
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		jmp_buf* error_handler;
-	#endif
-
-		xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size)
+		xpath_allocator(xpath_memory_block* root, bool* error = 0): _root(root), _root_size(0), _error(error)
 		{
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			error_handler = 0;
-		#endif
 		}
 
-		void* allocate_nothrow(size_t size)
+		void* allocate(size_t size)
 		{
 			// round size up to block alignment boundary
 			size = (size + xpath_memory_block_alignment - 1) & ~(xpath_memory_block_alignment - 1);
@@ -7432,7 +7521,11 @@ PUGI__NS_BEGIN
 				size_t block_size = block_capacity + offsetof(xpath_memory_block, data);
 
 				xpath_memory_block* block = static_cast<xpath_memory_block*>(xml_memory::allocate(block_size));
-				if (!block) return 0;
+				if (!block)
+				{
+					if (_error) *_error = true;
+					return 0;
+				}
 
 				block->next = _root;
 				block->capacity = block_capacity;
@@ -7444,23 +7537,6 @@ PUGI__NS_BEGIN
 			}
 		}
 
-		void* allocate(size_t size)
-		{
-			void* result = allocate_nothrow(size);
-
-			if (!result)
-			{
-			#ifdef PUGIXML_NO_EXCEPTIONS
-				assert(error_handler);
-				longjmp(*error_handler, 1);
-			#else
-				throw std::bad_alloc();
-			#endif
-			}
-
-			return result;
-		}
-
 		void* reallocate(void* ptr, size_t old_size, size_t new_size)
 		{
 			// round size up to block alignment boundary
@@ -7470,33 +7546,35 @@ PUGI__NS_BEGIN
 			// we can only reallocate the last object
 			assert(ptr == 0 || static_cast<char*>(ptr) + old_size == &_root->data[0] + _root_size);
 
-			// adjust root size so that we have not allocated the object at all
-			bool only_object = (_root_size == old_size);
+			// try to reallocate the object inplace
+			if (ptr && _root_size - old_size + new_size <= _root->capacity)
+			{
+				_root_size = _root_size - old_size + new_size;
+				return ptr;
+			}
 
-			if (ptr) _root_size -= old_size;
-
-			// allocate a new version (this will obviously reuse the memory if possible)
+			// allocate a new block
 			void* result = allocate(new_size);
-			assert(result);
+			if (!result) return 0;
 
 			// we have a new block
-			if (result != ptr && ptr)
+			if (ptr)
 			{
-				// copy old data
+				// copy old data (we only support growing)
 				assert(new_size >= old_size);
 				memcpy(result, ptr, old_size);
 
 				// free the previous page if it had no other objects
-				if (only_object)
-				{
-					assert(_root->data == result);
-					assert(_root->next);
+				assert(_root->data == result);
+				assert(_root->next);
 
+				if (_root->next->data == ptr)
+				{
+					// deallocate the whole page, unless it was the first one
 					xpath_memory_block* next = _root->next->next;
 
 					if (next)
 					{
-						// deallocate the whole page, unless it was the first one
 						xml_memory::deallocate(_root->next);
 						_root->next = next;
 					}
@@ -7568,22 +7646,15 @@ PUGI__NS_BEGIN
 		xpath_allocator result;
 		xpath_allocator temp;
 		xpath_stack stack;
+		bool oom;
 
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		jmp_buf error_handler;
-	#endif
-
-		xpath_stack_data(): result(blocks + 0), temp(blocks + 1)
+		xpath_stack_data(): result(blocks + 0, &oom), temp(blocks + 1, &oom), oom(false) // both allocators record allocation failure in the shared oom flag
 		{
 			blocks[0].next = blocks[1].next = 0;
 			blocks[0].capacity = blocks[1].capacity = sizeof(blocks[0].data);
 
 			stack.result = &result;
 			stack.temp = &temp;
-
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			result.error_handler = temp.error_handler = &error_handler;
-		#endif
 		}
 
 		~xpath_stack_data()
@@ -7605,7 +7676,7 @@ PUGI__NS_BEGIN
 		static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc)
 		{
 			char_t* result = static_cast<char_t*>(alloc->allocate((length + 1) * sizeof(char_t)));
-			assert(result);
+			if (!result) return 0;
 
 			memcpy(result, string, length * sizeof(char_t));
 			result[length] = 0;
@@ -7634,9 +7705,13 @@ PUGI__NS_BEGIN
 		{
 			assert(begin <= end);
 
-			size_t length = static_cast<size_t>(end - begin);
+			if (begin == end)
+				return xpath_string();
 
-			return length == 0 ? xpath_string() : xpath_string(duplicate_string(begin, length, alloc), true, length);
+			size_t length = static_cast<size_t>(end - begin);
+			const char_t* data = duplicate_string(begin, length, alloc);
+
+			return data ? xpath_string(data, true, length) : xpath_string();
 		}
 
 		xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false), _length_heap(0)
@@ -7662,7 +7737,7 @@ PUGI__NS_BEGIN
 
 				// allocate new buffer
 				char_t* result = static_cast<char_t*>(alloc->reallocate(_uses_heap ? const_cast<char_t*>(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t)));
-				assert(result);
+				if (!result) return;
 
 				// append first string to the new buffer in case there was no reallocation
 				if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t));
@@ -7694,8 +7769,11 @@ PUGI__NS_BEGIN
 			if (!_uses_heap)
 			{
 				size_t length_ = strlength(_buffer);
+				const char_t* data_ = duplicate_string(_buffer, length_, alloc);
 
-				_buffer = duplicate_string(_buffer, length_, alloc);
+				if (!data_) return 0;
+
+				_buffer = data_;
 				_uses_heap = true;
 				_length_heap = length_;
 			}
@@ -8058,11 +8136,11 @@ PUGI__NS_BEGIN
 
 	// gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent
 #if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
-	PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+	PUGI__FN void convert_number_to_mantissa_exponent(double value, char (&buffer)[32], char** out_mantissa, int* out_exponent)
 	{
 		// get base values
 		int sign, exponent;
-		_ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign);
+		_ecvt_s(buffer, sizeof(buffer), value, DBL_DIG + 1, &exponent, &sign);
 
 		// truncate redundant zeros
 		truncate_zeros(buffer, buffer + strlen(buffer));
@@ -8072,12 +8150,10 @@ PUGI__NS_BEGIN
 		*out_exponent = exponent;
 	}
 #else
-	PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+	PUGI__FN void convert_number_to_mantissa_exponent(double value, char (&buffer)[32], char** out_mantissa, int* out_exponent)
 	{
 		// get a scientific notation value with IEEE DBL_DIG decimals
-		sprintf(buffer, "%.*e", DBL_DIG, value);
-		assert(strlen(buffer) < buffer_size);
-		(void)!buffer_size;
+		PUGI__SNPRINTF(buffer, "%.*e", DBL_DIG, value);
 
 		// get the exponent (possibly negative)
 		char* exponent_string = strchr(buffer, 'e');
@@ -8114,12 +8190,12 @@ PUGI__NS_BEGIN
 
 		char* mantissa;
 		int exponent;
-		convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent);
+		convert_number_to_mantissa_exponent(value, mantissa_buffer, &mantissa, &exponent);
 
 		// allocate a buffer of suitable length for the number
 		size_t result_size = strlen(mantissa_buffer) + (exponent > 0 ? exponent : -exponent) + 4;
 		char_t* result = static_cast<char_t*>(alloc->allocate(sizeof(char_t) * result_size));
-		assert(result);
+		if (!result) return xpath_string();
 
 		// make the number!
 		char_t* s = result;
@@ -8136,7 +8212,7 @@ PUGI__NS_BEGIN
 		{
 			while (exponent > 0)
 			{
-				assert(*mantissa == 0 || static_cast<unsigned int>(static_cast<unsigned int>(*mantissa) - '0') <= 9);
+				assert(*mantissa == 0 || static_cast<unsigned int>(*mantissa - '0') <= 9);
 				*s++ = *mantissa ? *mantissa++ : '0';
 				exponent--;
 			}
@@ -8403,12 +8479,10 @@ PUGI__NS_BEGIN
 			if (!table[i])
 				table[i] = static_cast<unsigned char>(i);
 
-		void* result = alloc->allocate_nothrow(sizeof(table));
+		void* result = alloc->allocate(sizeof(table));
+		if (!result) return 0;
 
-		if (result)
-		{
-			memcpy(result, table, sizeof(table));
-		}
+		memcpy(result, table, sizeof(table));
 
 		return static_cast<unsigned char*>(result);
 	}
@@ -8495,7 +8569,7 @@ PUGI__NS_BEGIN
 
 	static const xpath_node_set dummy_node_set;
 
-	PUGI__FN unsigned int hash_string(const char_t* str)
+	PUGI__FN PUGI__UNSIGNED_OVERFLOW unsigned int hash_string(const char_t* str)
 	{
 		// Jenkins one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time)
 		unsigned int result = 0;
@@ -8578,7 +8652,7 @@ PUGI__NS_BEGIN
 			break;
 
 		default:
-			assert(false && "Invalid variable type");
+			assert(false && "Invalid variable type"); // unreachable
 		}
 	}
 
@@ -8599,7 +8673,7 @@ PUGI__NS_BEGIN
 			return lhs->set(static_cast<const xpath_variable_boolean*>(rhs)->value);
 
 		default:
-			assert(false && "Invalid variable type");
+			assert(false && "Invalid variable type"); // unreachable
 			return false;
 		}
 	}
@@ -8686,7 +8760,7 @@ PUGI__NS_BEGIN
 			return *min_element(begin, end, document_order_comparator());
 
 		default:
-			assert(false && "Invalid node set type");
+			assert(false && "Invalid node set type"); // unreachable
 			return xpath_node();
 		}
 	}
@@ -8751,7 +8825,7 @@ PUGI__NS_BEGIN
 			{
 				// reallocate the old array or allocate a new one
 				xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node)));
-				assert(data);
+				if (!data) return;
 
 				// finalize
 				_begin = data;
@@ -8803,7 +8877,7 @@ PUGI__NS_BEGIN
 
 		// reallocate the old array or allocate a new one
 		xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node)));
-		assert(data);
+		if (!data) return;
 
 		// finalize
 		_begin = data;
@@ -9416,7 +9490,7 @@ PUGI__NS_BEGIN
 				}
 			}
 
-			assert(false && "Wrong types");
+			assert(false && "Wrong types"); // unreachable
 			return false;
 		}
 
@@ -9491,7 +9565,7 @@ PUGI__NS_BEGIN
 			}
 			else
 			{
-				assert(false && "Wrong types");
+				assert(false && "Wrong types"); // unreachable
 				return false;
 			}
 		}
@@ -9709,7 +9783,7 @@ PUGI__NS_BEGIN
 				break;
 
 			default:
-				assert(false && "Unknown axis");
+				assert(false && "Unknown axis"); // unreachable
 			}
 
 			return false;
@@ -9904,7 +9978,7 @@ PUGI__NS_BEGIN
 			}
 
 			default:
-				assert(false && "Unimplemented axis");
+				assert(false && "Unimplemented axis"); // unreachable
 			}
 		}
 
@@ -9985,7 +10059,7 @@ PUGI__NS_BEGIN
 			}
 
 			default:
-				assert(false && "Unimplemented axis");
+				assert(false && "Unimplemented axis"); // unreachable
 			}
 		}
 
@@ -10201,10 +10275,9 @@ PUGI__NS_BEGIN
 
 				if (_rettype == xpath_type_boolean)
 					return _data.variable->get_boolean();
-
-				// fallthrough to type conversion
 			}
 
+			// fallthrough
 			default:
 			{
 				switch (_rettype)
@@ -10227,7 +10300,7 @@ PUGI__NS_BEGIN
 				}
 
 				default:
-					assert(false && "Wrong expression for return type boolean");
+					assert(false && "Wrong expression for return type boolean"); // unreachable
 					return false;
 				}
 			}
@@ -10337,10 +10410,9 @@ PUGI__NS_BEGIN
 
 				if (_rettype == xpath_type_number)
 					return _data.variable->get_number();
-
-				// fallthrough to type conversion
 			}
 
+			// fallthrough
 			default:
 			{
 				switch (_rettype)
@@ -10363,7 +10435,7 @@ PUGI__NS_BEGIN
 				}
 
 				default:
-					assert(false && "Wrong expression for return type number");
+					assert(false && "Wrong expression for return type number"); // unreachable
 					return 0;
 				}
 
@@ -10381,16 +10453,9 @@ PUGI__NS_BEGIN
 			size_t count = 1;
 			for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++;
 
-			// gather all strings
-			xpath_string static_buffer[4];
-			xpath_string* buffer = static_buffer;
-
-			// allocate on-heap for large concats
-			if (count > sizeof(static_buffer) / sizeof(static_buffer[0]))
-			{
-				buffer = static_cast<xpath_string*>(stack.temp->allocate(count * sizeof(xpath_string)));
-				assert(buffer);
-			}
+			// allocate a buffer for temporary string objects
+			xpath_string* buffer = static_cast<xpath_string*>(stack.temp->allocate(count * sizeof(xpath_string)));
+			if (!buffer) return xpath_string();
 
 			// evaluate all strings to temporary stack
 			xpath_stack swapped_stack = {stack.temp, stack.result};
@@ -10407,7 +10472,7 @@ PUGI__NS_BEGIN
 
 			// create final string
 			char_t* result = static_cast<char_t*>(stack.result->allocate((length + 1) * sizeof(char_t)));
-			assert(result);
+			if (!result) return xpath_string();
 
 			char_t* ri = result;
 
@@ -10574,6 +10639,8 @@ PUGI__NS_BEGIN
 				xpath_string s = string_value(c.n, stack.result);
 
 				char_t* begin = s.data(stack.result);
+				if (!begin) return xpath_string();
+
 				char_t* end = normalize_space(begin);
 
 				return xpath_string::from_heap_preallocated(begin, end);
@@ -10584,6 +10651,8 @@ PUGI__NS_BEGIN
 				xpath_string s = _left->eval_string(c, stack);
 
 				char_t* begin = s.data(stack.result);
+				if (!begin) return xpath_string();
+
 				char_t* end = normalize_space(begin);
 
 				return xpath_string::from_heap_preallocated(begin, end);
@@ -10600,6 +10669,8 @@ PUGI__NS_BEGIN
 				xpath_string to = _right->_next->eval_string(c, swapped_stack);
 
 				char_t* begin = s.data(stack.result);
+				if (!begin) return xpath_string();
+
 				char_t* end = translate(begin, from.c_str(), to.c_str(), to.length());
 
 				return xpath_string::from_heap_preallocated(begin, end);
@@ -10610,6 +10681,8 @@ PUGI__NS_BEGIN
 				xpath_string s = _left->eval_string(c, stack);
 
 				char_t* begin = s.data(stack.result);
+				if (!begin) return xpath_string();
+
 				char_t* end = translate_table(begin, _data.table);
 
 				return xpath_string::from_heap_preallocated(begin, end);
@@ -10621,10 +10694,9 @@ PUGI__NS_BEGIN
 
 				if (_rettype == xpath_type_string)
 					return xpath_string::from_const(_data.variable->get_string());
-
-				// fallthrough to type conversion
 			}
 
+			// fallthrough
 			default:
 			{
 				switch (_rettype)
@@ -10646,7 +10718,7 @@ PUGI__NS_BEGIN
 				}
 
 				default:
-					assert(false && "Wrong expression for return type string");
+					assert(false && "Wrong expression for return type string"); // unreachable
 					return xpath_string();
 				}
 			}
@@ -10737,7 +10809,7 @@ PUGI__NS_BEGIN
 					return step_do(c, stack, eval, axis_to_type<axis_self>());
 
 				default:
-					assert(false && "Unknown axis");
+					assert(false && "Unknown axis"); // unreachable
 					return xpath_node_set_raw();
 				}
 			}
@@ -10771,12 +10843,11 @@ PUGI__NS_BEGIN
 
 					return ns;
 				}
-
-				// fallthrough to type conversion
 			}
 
+			// fallthrough
 			default:
-				assert(false && "Wrong expression for return type node set");
+				assert(false && "Wrong expression for return type node set"); // unreachable
 				return xpath_node_set_raw();
 			}
 		}
@@ -10919,66 +10990,77 @@ PUGI__NS_BEGIN
 
 		char_t _scratch[32];
 
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		jmp_buf _error_handler;
-	#endif
-
-		void throw_error(const char* message)
+		xpath_ast_node* error(const char* message) // records a parse error in _result instead of throwing or longjmp-ing
 		{
 			_result->error = message;
 			_result->offset = _lexer.current_pos() - _query;
 
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			longjmp(_error_handler, 1);
-		#else
-			throw xpath_exception(*_result);
-		#endif
+			return 0; // always null, so call sites can write "return error(...)"
 		}
 
-		void throw_error_oom()
+		xpath_ast_node* error_oom() // reports out-of-memory through the allocator's error flag
 		{
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			throw_error("Out of memory");
-		#else
-			throw std::bad_alloc();
-		#endif
+			assert(_alloc->_error); // the allocator must have been given an error flag to write to
+			*_alloc->_error = true;
+
+			return 0; // always null, mirroring error()
 		}
 
 		void* alloc_node()
 		{
-			void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node));
+			return _alloc->allocate(sizeof(xpath_ast_node)); // raw storage for one node; the result is passed through unchecked, callers test it for null
+		}
 
-			if (!result) throw_error_oom();
+		xpath_ast_node* alloc_node(ast_type_t type, xpath_value_type rettype, const char_t* value)
+		{
+			void* memory = alloc_node();
+			return memory ? new (memory) xpath_ast_node(type, rettype, value) : 0; // placement-new into the allocated storage; 0 if allocation failed
+		}
 
-			return result;
+		xpath_ast_node* alloc_node(ast_type_t type, xpath_value_type rettype, double value)
+		{
+			void* memory = alloc_node();
+			return memory ? new (memory) xpath_ast_node(type, rettype, value) : 0; // same pattern as above, forwarding to the double-valued constructor
+		}
+
+		xpath_ast_node* alloc_node(ast_type_t type, xpath_value_type rettype, xpath_variable* value)
+		{
+			void* memory = alloc_node();
+			return memory ? new (memory) xpath_ast_node(type, rettype, value) : 0; // same pattern, forwarding to the variable constructor
+		}
+
+		xpath_ast_node* alloc_node(ast_type_t type, xpath_value_type rettype, xpath_ast_node* left = 0, xpath_ast_node* right = 0)
+		{
+			void* memory = alloc_node();
+			return memory ? new (memory) xpath_ast_node(type, rettype, left, right) : 0; // same pattern, forwarding to the left/right child constructor
+		}
+
+		xpath_ast_node* alloc_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents)
+		{
+			void* memory = alloc_node();
+			return memory ? new (memory) xpath_ast_node(type, left, axis, test, contents) : 0; // same pattern, forwarding to the axis/node-test constructor
+		}
+
+		xpath_ast_node* alloc_node(ast_type_t type, xpath_ast_node* left, xpath_ast_node* right, predicate_t test)
+		{
+			void* memory = alloc_node();
+			return memory ? new (memory) xpath_ast_node(type, left, right, test) : 0; // same pattern, forwarding to the predicate constructor
 		}
 
 		const char_t* alloc_string(const xpath_lexer_string& value)
 		{
-			if (value.begin)
-			{
-				size_t length = static_cast<size_t>(value.end - value.begin);
+			if (!value.begin)
+				return PUGIXML_TEXT(""); // no source string: return an empty literal (the previous code returned 0 here)
 
-				char_t* c = static_cast<char_t*>(_alloc->allocate_nothrow((length + 1) * sizeof(char_t)));
-				if (!c) throw_error_oom();
-				assert(c); // workaround for clang static analysis
+			size_t length = static_cast<size_t>(value.end - value.begin);
 
-				memcpy(c, value.begin, length * sizeof(char_t));
-				c[length] = 0;
+			char_t* c = static_cast<char_t*>(_alloc->allocate((length + 1) * sizeof(char_t))); // one extra element for the terminator
+			if (!c) return 0; // a null result now means the allocation failed
 
-				return c;
-			}
-			else return 0;
-		}
+			memcpy(c, value.begin, length * sizeof(char_t));
+			c[length] = 0;
 
-		xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2])
-		{
-			assert(argc <= 1);
-
-			if (argc == 1 && args[0]->rettype() != xpath_type_node_set)
-				throw_error("Function has to be applied to node set");
-
-			return new (alloc_node()) xpath_ast_node(argc == 0 ? type0 : type1, xpath_type_string, args[0]);
+			return c;
 		}
 
 		xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2])
@@ -10987,103 +11069,110 @@ PUGI__NS_BEGIN
 			{
 			case 'b':
 				if (name == PUGIXML_TEXT("boolean") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]);
+					return alloc_node(ast_func_boolean, xpath_type_boolean, args[0]);
 
 				break;
 
 			case 'c':
 				if (name == PUGIXML_TEXT("count") && argc == 1)
 				{
-					if (args[0]->rettype() != xpath_type_node_set)
-						throw_error("Function has to be applied to node set");
-
-					return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]);
+					if (args[0]->rettype() != xpath_type_node_set) return error("Function has to be applied to node set");
+					return alloc_node(ast_func_count, xpath_type_number, args[0]);
 				}
 				else if (name == PUGIXML_TEXT("contains") && argc == 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_boolean, args[0], args[1]);
+					return alloc_node(ast_func_contains, xpath_type_boolean, args[0], args[1]);
 				else if (name == PUGIXML_TEXT("concat") && argc >= 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]);
+					return alloc_node(ast_func_concat, xpath_type_string, args[0], args[1]);
 				else if (name == PUGIXML_TEXT("ceiling") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]);
+					return alloc_node(ast_func_ceiling, xpath_type_number, args[0]);
 
 				break;
 
 			case 'f':
 				if (name == PUGIXML_TEXT("false") && argc == 0)
-					return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean);
+					return alloc_node(ast_func_false, xpath_type_boolean);
 				else if (name == PUGIXML_TEXT("floor") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]);
+					return alloc_node(ast_func_floor, xpath_type_number, args[0]);
 
 				break;
 
 			case 'i':
 				if (name == PUGIXML_TEXT("id") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]);
+					return alloc_node(ast_func_id, xpath_type_node_set, args[0]);
 
 				break;
 
 			case 'l':
 				if (name == PUGIXML_TEXT("last") && argc == 0)
-					return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number);
+					return alloc_node(ast_func_last, xpath_type_number);
 				else if (name == PUGIXML_TEXT("lang") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]);
+					return alloc_node(ast_func_lang, xpath_type_boolean, args[0]);
 				else if (name == PUGIXML_TEXT("local-name") && argc <= 1)
-					return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args);
+				{
+					if (argc == 1 && args[0]->rettype() != xpath_type_node_set) return error("Function has to be applied to node set");
+					return alloc_node(argc == 0 ? ast_func_local_name_0 : ast_func_local_name_1, xpath_type_string, args[0]);
+				}
 
 				break;
 
 			case 'n':
 				if (name == PUGIXML_TEXT("name") && argc <= 1)
-					return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args);
+				{
+					if (argc == 1 && args[0]->rettype() != xpath_type_node_set) return error("Function has to be applied to node set");
+					return alloc_node(argc == 0 ? ast_func_name_0 : ast_func_name_1, xpath_type_string, args[0]);
+				}
 				else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1)
-					return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args);
+				{
+					if (argc == 1 && args[0]->rettype() != xpath_type_node_set) return error("Function has to be applied to node set");
+					return alloc_node(argc == 0 ? ast_func_namespace_uri_0 : ast_func_namespace_uri_1, xpath_type_string, args[0]);
+				}
 				else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1)
-					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]);
+					return alloc_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]);
 				else if (name == PUGIXML_TEXT("not") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]);
+					return alloc_node(ast_func_not, xpath_type_boolean, args[0]);
 				else if (name == PUGIXML_TEXT("number") && argc <= 1)
-					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]);
+					return alloc_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]);
 
 				break;
 
 			case 'p':
 				if (name == PUGIXML_TEXT("position") && argc == 0)
-					return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number);
+					return alloc_node(ast_func_position, xpath_type_number);
 
 				break;
 
 			case 'r':
 				if (name == PUGIXML_TEXT("round") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]);
+					return alloc_node(ast_func_round, xpath_type_number, args[0]);
 
 				break;
 
 			case 's':
 				if (name == PUGIXML_TEXT("string") && argc <= 1)
-					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]);
+					return alloc_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]);
 				else if (name == PUGIXML_TEXT("string-length") && argc <= 1)
-					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, args[0]);
+					return alloc_node(argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, args[0]);
 				else if (name == PUGIXML_TEXT("starts-with") && argc == 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]);
+					return alloc_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]);
 				else if (name == PUGIXML_TEXT("substring-before") && argc == 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]);
+					return alloc_node(ast_func_substring_before, xpath_type_string, args[0], args[1]);
 				else if (name == PUGIXML_TEXT("substring-after") && argc == 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]);
+					return alloc_node(ast_func_substring_after, xpath_type_string, args[0], args[1]);
 				else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3))
-					return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]);
+					return alloc_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]);
 				else if (name == PUGIXML_TEXT("sum") && argc == 1)
 				{
-					if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
-					return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]);
+					if (args[0]->rettype() != xpath_type_node_set) return error("Function has to be applied to node set");
+					return alloc_node(ast_func_sum, xpath_type_number, args[0]);
 				}
 
 				break;
 
 			case 't':
 				if (name == PUGIXML_TEXT("translate") && argc == 3)
-					return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]);
+					return alloc_node(ast_func_translate, xpath_type_string, args[0], args[1]);
 				else if (name == PUGIXML_TEXT("true") && argc == 0)
-					return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean);
+					return alloc_node(ast_func_true, xpath_type_boolean);
 
 				break;
 
@@ -11091,9 +11180,7 @@ PUGI__NS_BEGIN
 				break;
 			}
 
-			throw_error("Unrecognized function or wrong parameter count");
-
-			return 0;
+			return error("Unrecognized function or wrong parameter count");
 		}
 
 		axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified)
@@ -11209,18 +11296,18 @@ PUGI__NS_BEGIN
 				xpath_lexer_string name = _lexer.contents();
 
 				if (!_variables)
-					throw_error("Unknown variable: variable set is not provided");
+					return error("Unknown variable: variable set is not provided");
 
 				xpath_variable* var = 0;
 				if (!get_variable_scratch(_scratch, _variables, name.begin, name.end, &var))
-					throw_error_oom();
+					return error_oom();
 
 				if (!var)
-					throw_error("Unknown variable: variable set does not contain the given name");
+					return error("Unknown variable: variable set does not contain the given name");
 
 				_lexer.next();
 
-				return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var);
+				return alloc_node(ast_variable, var->type(), var);
 			}
 
 			case lex_open_brace:
@@ -11228,9 +11315,10 @@ PUGI__NS_BEGIN
 				_lexer.next();
 
 				xpath_ast_node* n = parse_expression();
+				if (!n) return 0;
 
 				if (_lexer.current() != lex_close_brace)
-					throw_error("Unmatched braces");
+					return error("Expected ')' to match an opening '('");
 
 				_lexer.next();
 
@@ -11240,11 +11328,11 @@ PUGI__NS_BEGIN
 			case lex_quoted_string:
 			{
 				const char_t* value = alloc_string(_lexer.contents());
+				if (!value) return 0;
 
-				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value);
 				_lexer.next();
 
-				return n;
+				return alloc_node(ast_string_constant, xpath_type_string, value);
 			}
 
 			case lex_number:
@@ -11252,12 +11340,11 @@ PUGI__NS_BEGIN
 				double value = 0;
 
 				if (!convert_string_to_number_scratch(_scratch, _lexer.contents().begin, _lexer.contents().end, &value))
-					throw_error_oom();
+					return error_oom();
 
-				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value);
 				_lexer.next();
 
-				return n;
+				return alloc_node(ast_number_constant, xpath_type_number, value);
 			}
 
 			case lex_string:
@@ -11271,19 +11358,20 @@ PUGI__NS_BEGIN
 				xpath_ast_node* last_arg = 0;
 
 				if (_lexer.current() != lex_open_brace)
-					throw_error("Unrecognized function call");
+					return error("Unrecognized function call");
 				_lexer.next();
 
-				if (_lexer.current() != lex_close_brace)
-					args[argc++] = parse_expression();
-
 				while (_lexer.current() != lex_close_brace)
 				{
-					if (_lexer.current() != lex_comma)
-						throw_error("No comma between function arguments");
-					_lexer.next();
+					if (argc > 0)
+					{
+						if (_lexer.current() != lex_comma)
+							return error("No comma between function arguments");
+						_lexer.next();
+					}
 
 					xpath_ast_node* n = parse_expression();
+					if (!n) return 0;
 
 					if (argc < 2) args[argc] = n;
 					else last_arg->set_next(n);
@@ -11298,9 +11386,7 @@ PUGI__NS_BEGIN
 			}
 
 			default:
-				throw_error("Unrecognizable primary expression");
-
-				return 0;
+				return error("Unrecognizable primary expression");
 			}
 		}
 
@@ -11310,20 +11396,23 @@ PUGI__NS_BEGIN
 		xpath_ast_node* parse_filter_expression()
 		{
 			xpath_ast_node* n = parse_primary_expression();
+			if (!n) return 0;
 
 			while (_lexer.current() == lex_open_square_brace)
 			{
 				_lexer.next();
 
-				xpath_ast_node* expr = parse_expression();
-
 				if (n->rettype() != xpath_type_node_set)
-					throw_error("Predicate has to be applied to node set");
+					return error("Predicate has to be applied to node set");
 
-				n = new (alloc_node()) xpath_ast_node(ast_filter, n, expr, predicate_default);
+				xpath_ast_node* expr = parse_expression();
+				if (!expr) return 0;
+
+				n = alloc_node(ast_filter, n, expr, predicate_default);
+				if (!n) return 0;
 
 				if (_lexer.current() != lex_close_square_brace)
-					throw_error("Unmatched square brace");
+					return error("Expected ']' to match an opening '['");
 
 				_lexer.next();
 			}
@@ -11339,7 +11428,7 @@ PUGI__NS_BEGIN
 		xpath_ast_node* parse_step(xpath_ast_node* set)
 		{
 			if (set && set->rettype() != xpath_type_node_set)
-				throw_error("Step has to be applied to node set");
+				return error("Step has to be applied to node set");
 
 			bool axis_specified = false;
 			axis_t axis = axis_child; // implied child axis
@@ -11355,13 +11444,19 @@ PUGI__NS_BEGIN
 			{
 				_lexer.next();
 
-				return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0);
+				if (_lexer.current() == lex_open_square_brace)
+					return error("Predicates are not allowed after an abbreviated step");
+
+				return alloc_node(ast_step, set, axis_self, nodetest_type_node, 0);
 			}
 			else if (_lexer.current() == lex_double_dot)
 			{
 				_lexer.next();
 
-				return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0);
+				if (_lexer.current() == lex_open_square_brace)
+					return error("Predicates are not allowed after an abbreviated step");
+
+				return alloc_node(ast_step, set, axis_parent, nodetest_type_node, 0);
 			}
 
 			nodetest_t nt_type = nodetest_none;
@@ -11378,12 +11473,12 @@ PUGI__NS_BEGIN
 				{
 					// parse axis name
 					if (axis_specified)
-						throw_error("Two axis specifiers in one step");
+						return error("Two axis specifiers in one step");
 
 					axis = parse_axis_name(nt_name, axis_specified);
 
 					if (!axis_specified)
-						throw_error("Unknown axis");
+						return error("Unknown axis");
 
 					// read actual node test
 					_lexer.next();
@@ -11399,7 +11494,10 @@ PUGI__NS_BEGIN
 						nt_name = _lexer.contents();
 						_lexer.next();
 					}
-					else throw_error("Unrecognized node test");
+					else
+					{
+						return error("Unrecognized node test");
+					}
 				}
 
 				if (nt_type == nodetest_none)
@@ -11416,26 +11514,26 @@ PUGI__NS_BEGIN
 							nt_type = parse_node_test_type(nt_name);
 
 							if (nt_type == nodetest_none)
-								throw_error("Unrecognized node type");
+								return error("Unrecognized node type");
 
 							nt_name = xpath_lexer_string();
 						}
 						else if (nt_name == PUGIXML_TEXT("processing-instruction"))
 						{
 							if (_lexer.current() != lex_quoted_string)
-								throw_error("Only literals are allowed as arguments to processing-instruction()");
+								return error("Only literals are allowed as arguments to processing-instruction()");
 
 							nt_type = nodetest_pi;
 							nt_name = _lexer.contents();
 							_lexer.next();
 
 							if (_lexer.current() != lex_close_brace)
-								throw_error("Unmatched brace near processing-instruction()");
+								return error("Unmatched brace near processing-instruction()");
 							_lexer.next();
 						}
 						else
 						{
-							throw_error("Unmatched brace near node type test");
+							return error("Unmatched brace near node type test");
 						}
 					}
 					// QName or NCName:*
@@ -11461,11 +11559,14 @@ PUGI__NS_BEGIN
 			}
 			else
 			{
-				throw_error("Unrecognized node test");
+				return error("Unrecognized node test");
 			}
 
 			const char_t* nt_name_copy = alloc_string(nt_name);
-			xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, nt_name_copy);
+			if (!nt_name_copy) return 0;
+
+			xpath_ast_node* n = alloc_node(ast_step, set, axis, nt_type, nt_name_copy);
+			if (!n) return 0;
 
 			xpath_ast_node* last = 0;
 
@@ -11474,11 +11575,13 @@ PUGI__NS_BEGIN
 				_lexer.next();
 
 				xpath_ast_node* expr = parse_expression();
+				if (!expr) return 0;
 
-				xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, 0, expr, predicate_default);
+				xpath_ast_node* pred = alloc_node(ast_predicate, 0, expr, predicate_default);
+				if (!pred) return 0;
 
 				if (_lexer.current() != lex_close_square_brace)
-					throw_error("Unmatched square brace");
+					return error("Expected ']' to match an opening '['");
 				_lexer.next();
 
 				if (last) last->set_next(pred);
@@ -11494,6 +11597,7 @@ PUGI__NS_BEGIN
 		xpath_ast_node* parse_relative_location_path(xpath_ast_node* set)
 		{
 			xpath_ast_node* n = parse_step(set);
+			if (!n) return 0;
 
 			while (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
 			{
@@ -11501,9 +11605,13 @@ PUGI__NS_BEGIN
 				_lexer.next();
 
 				if (l == lex_double_slash)
-					n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+				{
+					n = alloc_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+					if (!n) return 0;
+				}
 
 				n = parse_step(n);
+				if (!n) return 0;
 			}
 
 			return n;
@@ -11517,7 +11625,8 @@ PUGI__NS_BEGIN
 			{
 				_lexer.next();
 
-				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+				xpath_ast_node* n = alloc_node(ast_step_root, xpath_type_node_set);
+				if (!n) return 0;
 
 				// relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path
 				lexeme_t l = _lexer.current();
@@ -11531,8 +11640,11 @@ PUGI__NS_BEGIN
 			{
 				_lexer.next();
 
-				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
-				n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+				xpath_ast_node* n = alloc_node(ast_step_root, xpath_type_node_set);
+				if (!n) return 0;
+
+				n = alloc_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+				if (!n) return 0;
 
 				return parse_relative_location_path(n);
 			}
@@ -11555,7 +11667,6 @@ PUGI__NS_BEGIN
 			// PrimaryExpr begins with '$' in case of it being a variable reference,
 			// '(' in case of it being an expression, string literal, number constant or
 			// function call.
-
 			if (_lexer.current() == lex_var_ref || _lexer.current() == lex_open_brace ||
 				_lexer.current() == lex_quoted_string || _lexer.current() == lex_number ||
 				_lexer.current() == lex_string)
@@ -11567,7 +11678,8 @@ PUGI__NS_BEGIN
 
 					while (PUGI__IS_CHARTYPE(*state, ct_space)) ++state;
 
-					if (*state != '(') return parse_location_path();
+					if (*state != '(')
+						return parse_location_path();
 
 					// This looks like a function call; however this still can be a node-test. Check it.
 					if (parse_node_test_type(_lexer.contents()) != nodetest_none)
@@ -11575,6 +11687,7 @@ PUGI__NS_BEGIN
 				}
 
 				xpath_ast_node* n = parse_filter_expression();
+				if (!n) return 0;
 
 				if (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
 				{
@@ -11584,9 +11697,10 @@ PUGI__NS_BEGIN
 					if (l == lex_double_slash)
 					{
 						if (n->rettype() != xpath_type_node_set)
-							throw_error("Step has to be applied to node set");
+							return error("Step has to be applied to node set");
 
-						n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+						n = alloc_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+						if (!n) return 0;
 					}
 
 					// select from location path
@@ -11600,9 +11714,10 @@ PUGI__NS_BEGIN
 				_lexer.next();
 
 				// precedence 7+ - only parses union expressions
-				xpath_ast_node* expr = parse_expression_rec(parse_path_or_unary_expression(), 7);
+				xpath_ast_node* n = parse_expression(7);
+				if (!n) return 0;
 
-				return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr);
+				return alloc_node(ast_op_negate, xpath_type_number, n);
 			}
 			else
 			{
@@ -11685,20 +11800,23 @@ PUGI__NS_BEGIN
 				_lexer.next();
 
 				xpath_ast_node* rhs = parse_path_or_unary_expression();
+				if (!rhs) return 0;
 
 				binary_op_t nextop = binary_op_t::parse(_lexer);
 
 				while (nextop.asttype != ast_unknown && nextop.precedence > op.precedence)
 				{
 					rhs = parse_expression_rec(rhs, nextop.precedence);
+					if (!rhs) return 0;
 
 					nextop = binary_op_t::parse(_lexer);
 				}
 
 				if (op.asttype == ast_op_union && (lhs->rettype() != xpath_type_node_set || rhs->rettype() != xpath_type_node_set))
-					throw_error("Union operator has to be applied to node sets");
+					return error("Union operator has to be applied to node sets");
 
-				lhs = new (alloc_node()) xpath_ast_node(op.asttype, op.rettype, lhs, rhs);
+				lhs = alloc_node(op.asttype, op.rettype, lhs, rhs);
+				if (!lhs) return 0;
 
 				op = binary_op_t::parse(_lexer);
 			}
@@ -11724,9 +11842,12 @@ PUGI__NS_BEGIN
 		//						  | MultiplicativeExpr '*' UnaryExpr
 		//						  | MultiplicativeExpr 'div' UnaryExpr
 		//						  | MultiplicativeExpr 'mod' UnaryExpr
-		xpath_ast_node* parse_expression()
+		xpath_ast_node* parse_expression(int limit = 0)
 		{
-			return parse_expression_rec(parse_path_or_unary_expression(), 0);
+			xpath_ast_node* n = parse_path_or_unary_expression();
+			if (!n) return 0;
+
+			return parse_expression_rec(n, limit);
 		}
 
 		xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result)
@@ -11735,26 +11856,21 @@ PUGI__NS_BEGIN
 
 		xpath_ast_node* parse()
 		{
-			xpath_ast_node* result = parse_expression();
+			xpath_ast_node* n = parse_expression();
+			if (!n) return 0;
 
 			// check if there are unparsed tokens left
 			if (_lexer.current() != lex_eof)
-				throw_error("Incorrect query");
+				return error("Incorrect query");
 
-			return result;
+			return n;
 		}
 
 		static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result)
 		{
 			xpath_parser parser(query, variables, alloc, result);
 
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			int error = setjmp(parser._error_handler);
-
-			return (error == 0) ? parser.parse() : 0;
-		#else
 			return parser.parse();
-		#endif
 		}
 	};
 
@@ -11777,7 +11893,7 @@ PUGI__NS_BEGIN
 			xml_memory::deallocate(impl);
 		}
 
-		xpath_query_impl(): root(0), alloc(&block)
+		xpath_query_impl(): root(0), alloc(&block, &oom), oom(false)
 		{
 			block.next = 0;
 			block.capacity = sizeof(block.data);
@@ -11786,21 +11902,9 @@ PUGI__NS_BEGIN
 		xpath_ast_node* root;
 		xpath_allocator alloc;
 		xpath_memory_block block;
+		bool oom;
 	};
 
-	PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd)
-	{
-		if (!impl) return xpath_string();
-
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		if (setjmp(sd.error_handler)) return xpath_string();
-	#endif
-
-		xpath_context c(n, 1, 1);
-
-		return impl->root->eval_string(c, sd.stack);
-	}
-
 	PUGI__FN impl::xpath_ast_node* evaluate_node_set_prepare(xpath_query_impl* impl)
 	{
 		if (!impl) return 0;
@@ -11948,7 +12052,7 @@ namespace pugi
 	}
 
 #ifdef PUGIXML_HAS_MOVE
-	PUGI__FN void xpath_node_set::_move(xpath_node_set& rhs)
+	PUGI__FN void xpath_node_set::_move(xpath_node_set& rhs) PUGIXML_NOEXCEPT
 	{
 		_type = rhs._type;
 		_storage = rhs._storage;
@@ -11991,12 +12095,12 @@ namespace pugi
 	}
 
 #ifdef PUGIXML_HAS_MOVE
-	PUGI__FN xpath_node_set::xpath_node_set(xpath_node_set&& rhs): _type(type_unsorted), _begin(&_storage), _end(&_storage)
+	PUGI__FN xpath_node_set::xpath_node_set(xpath_node_set&& rhs) PUGIXML_NOEXCEPT: _type(type_unsorted), _begin(&_storage), _end(&_storage)
 	{
 		_move(rhs);
 	}
 
-	PUGI__FN xpath_node_set& xpath_node_set::operator=(xpath_node_set&& rhs)
+	PUGI__FN xpath_node_set& xpath_node_set::operator=(xpath_node_set&& rhs) PUGIXML_NOEXCEPT
 	{
 		if (this == &rhs) return *this;
 
@@ -12085,7 +12189,7 @@ namespace pugi
 			return static_cast<const impl::xpath_variable_boolean*>(this)->name;
 
 		default:
-			assert(false && "Invalid variable type");
+			assert(false && "Invalid variable type"); // unreachable
 			return 0;
 		}
 	}
@@ -12191,7 +12295,7 @@ namespace pugi
 	}
 
 #ifdef PUGIXML_HAS_MOVE
-	PUGI__FN xpath_variable_set::xpath_variable_set(xpath_variable_set&& rhs)
+	PUGI__FN xpath_variable_set::xpath_variable_set(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT
 	{
 		for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i)
 		{
@@ -12200,7 +12304,7 @@ namespace pugi
 		}
 	}
 
-	PUGI__FN xpath_variable_set& xpath_variable_set::operator=(xpath_variable_set&& rhs)
+	PUGI__FN xpath_variable_set& xpath_variable_set::operator=(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT
 	{
 		for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i)
 		{
@@ -12371,6 +12475,15 @@ namespace pugi
 				_impl = impl.release();
 				_result.error = 0;
 			}
+			else
+			{
+			#ifdef PUGIXML_NO_EXCEPTIONS
+				if (qimpl->oom) _result.error = "Out of memory";
+			#else
+				if (qimpl->oom) throw std::bad_alloc();
+				throw xpath_exception(_result);
+			#endif
+			}
 		}
 	}
 
@@ -12385,7 +12498,7 @@ namespace pugi
 	}
 
 #ifdef PUGIXML_HAS_MOVE
-	PUGI__FN xpath_query::xpath_query(xpath_query&& rhs)
+	PUGI__FN xpath_query::xpath_query(xpath_query&& rhs) PUGIXML_NOEXCEPT
 	{
 		_impl = rhs._impl;
 		_result = rhs._result;
@@ -12393,7 +12506,7 @@ namespace pugi
 		rhs._result = xpath_parse_result();
 	}
 
-	PUGI__FN xpath_query& xpath_query::operator=(xpath_query&& rhs)
+	PUGI__FN xpath_query& xpath_query::operator=(xpath_query&& rhs) PUGIXML_NOEXCEPT
 	{
 		if (this == &rhs) return *this;
 
@@ -12423,11 +12536,18 @@ namespace pugi
 		impl::xpath_context c(n, 1, 1);
 		impl::xpath_stack_data sd;
 
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		if (setjmp(sd.error_handler)) return false;
-	#endif
+		bool r = static_cast<impl::xpath_query_impl*>(_impl)->root->eval_boolean(c, sd.stack);
 
-		return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_boolean(c, sd.stack);
+		if (sd.oom)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			return false;
+		#else
+			throw std::bad_alloc();
+		#endif
+		}
+
+		return r;
 	}
 
 	PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const
@@ -12437,19 +12557,38 @@ namespace pugi
 		impl::xpath_context c(n, 1, 1);
 		impl::xpath_stack_data sd;
 
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		if (setjmp(sd.error_handler)) return impl::gen_nan();
-	#endif
+		double r = static_cast<impl::xpath_query_impl*>(_impl)->root->eval_number(c, sd.stack);
 
-		return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_number(c, sd.stack);
+		if (sd.oom)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			return impl::gen_nan();
+		#else
+			throw std::bad_alloc();
+		#endif
+		}
+
+		return r;
 	}
 
 #ifndef PUGIXML_NO_STL
 	PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const
 	{
+		if (!_impl) return string_t();
+
+		impl::xpath_context c(n, 1, 1);
 		impl::xpath_stack_data sd;
 
-		impl::xpath_string r = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd);
+		impl::xpath_string r = static_cast<impl::xpath_query_impl*>(_impl)->root->eval_string(c, sd.stack);
+
+		if (sd.oom)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			return string_t();
+		#else
+			throw std::bad_alloc();
+		#endif
+		}
 
 		return string_t(r.c_str(), r.length());
 	}
@@ -12457,9 +12596,19 @@ namespace pugi
 
 	PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const
 	{
+		impl::xpath_context c(n, 1, 1);
 		impl::xpath_stack_data sd;
 
-		impl::xpath_string r = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd);
+		impl::xpath_string r = _impl ? static_cast<impl::xpath_query_impl*>(_impl)->root->eval_string(c, sd.stack) : impl::xpath_string();
+
+		if (sd.oom)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			r = impl::xpath_string();
+		#else
+			throw std::bad_alloc();
+		#endif
+		}
 
 		size_t full_size = r.length() + 1;
 
@@ -12483,12 +12632,17 @@ namespace pugi
 		impl::xpath_context c(n, 1, 1);
 		impl::xpath_stack_data sd;
 
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		if (setjmp(sd.error_handler)) return xpath_node_set();
-	#endif
-
 		impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_all);
 
+		if (sd.oom)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			return xpath_node_set();
+		#else
+			throw std::bad_alloc();
+		#endif
+		}
+
 		return xpath_node_set(r.begin(), r.end(), r.type());
 	}
 
@@ -12500,12 +12654,17 @@ namespace pugi
 		impl::xpath_context c(n, 1, 1);
 		impl::xpath_stack_data sd;
 
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		if (setjmp(sd.error_handler)) return xpath_node();
-	#endif
-
 		impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_first);
 
+		if (sd.oom)
+		{
+		#ifdef PUGIXML_NO_EXCEPTIONS
+			return xpath_node();
+		#else
+			throw std::bad_alloc();
+		#endif
+		}
+
 		return r.first();
 	}
 
@@ -12531,7 +12690,7 @@ namespace pugi
 	PUGI__FN xpath_node xml_node::select_node(const char_t* query, xpath_variable_set* variables) const
 	{
 		xpath_query q(query, variables);
-		return select_node(q);
+		return q.evaluate_node(*this);
 	}
 
 	PUGI__FN xpath_node xml_node::select_node(const xpath_query& query) const
@@ -12542,7 +12701,7 @@ namespace pugi
 	PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const
 	{
 		xpath_query q(query, variables);
-		return select_nodes(q);
+		return q.evaluate_node_set(*this);
 	}
 
 	PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const
@@ -12553,7 +12712,7 @@ namespace pugi
 	PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const
 	{
 		xpath_query q(query, variables);
-		return select_single_node(q);
+		return q.evaluate_node(*this);
 	}
 
 	PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const
@@ -12574,12 +12733,18 @@ namespace pugi
 #	pragma warning(pop)
 #endif
 
+#if defined(_MSC_VER) && defined(__c2__)
+#	pragma clang diagnostic pop
+#endif
+
 // Undefine all local macros (makes sure we're not leaking macros in header-only mode)
 #undef PUGI__NO_INLINE
 #undef PUGI__UNLIKELY
 #undef PUGI__STATIC_ASSERT
 #undef PUGI__DMC_VOLATILE
+#undef PUGI__UNSIGNED_OVERFLOW
 #undef PUGI__MSVC_CRT_VERSION
+#undef PUGI__SNPRINTF
 #undef PUGI__NS_BEGIN
 #undef PUGI__NS_END
 #undef PUGI__FN
@@ -12606,7 +12771,7 @@ namespace pugi
 #endif
 
 /**
- * Copyright (c) 2006-2016 Arseny Kapoulkine
+ * Copyright (c) 2006-2018 Arseny Kapoulkine
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
diff --git a/lib/pugixml/pugixml.h b/Grid/pugixml/pugixml.h
similarity index 96%
rename from lib/pugixml/pugixml.h
rename to Grid/pugixml/pugixml.h
index 6288d4bd..86403be3 100644
--- a/lib/pugixml/pugixml.h
+++ b/Grid/pugixml/pugixml.h
@@ -1,7 +1,7 @@
 /**
- * pugixml parser - version 1.8
+ * pugixml parser - version 1.9
  * --------------------------------------------------------
- * Copyright (C) 2006-2016, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at http://pugixml.org/
  *
  * This library is distributed under the MIT License. See notice at the end
@@ -13,7 +13,7 @@
 
 #ifndef PUGIXML_VERSION
 // Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons
-#	define PUGIXML_VERSION 180
+#	define PUGIXML_VERSION 190
 #endif
 
 // Include user configuration file (this can define various configuration macros)
@@ -81,10 +81,30 @@
 #	endif
 #endif
 
+// If C++ is 2011 or higher, add 'noexcept' specifiers
+#ifndef PUGIXML_NOEXCEPT
+#	if __cplusplus >= 201103
+#		define PUGIXML_NOEXCEPT noexcept
+#	elif defined(_MSC_VER) && _MSC_VER >= 1900
+#		define PUGIXML_NOEXCEPT noexcept
+#	else
+#		define PUGIXML_NOEXCEPT
+#	endif
+#endif
+
+// Some functions can not be noexcept in compact mode
+#ifdef PUGIXML_COMPACT
+#	define PUGIXML_NOEXCEPT_IF_NOT_COMPACT
+#else
+#	define PUGIXML_NOEXCEPT_IF_NOT_COMPACT PUGIXML_NOEXCEPT
+#endif
+
 // If C++ is 2011 or higher, add 'override' qualifiers
 #ifndef PUGIXML_OVERRIDE
 #	if __cplusplus >= 201103
 #		define PUGIXML_OVERRIDE override
+#	elif defined(_MSC_VER) && _MSC_VER >= 1700
+#		define PUGIXML_OVERRIDE override
 #	else
 #		define PUGIXML_OVERRIDE
 #	endif
@@ -631,8 +651,8 @@ namespace pugi
 		xpath_node_set select_nodes(const xpath_query& query) const;
 
 		// (deprecated: use select_node instead) Select single node by evaluating XPath query.
-		xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
-		xpath_node select_single_node(const xpath_query& query) const;
+		PUGIXML_DEPRECATED xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+		PUGIXML_DEPRECATED xpath_node select_single_node(const xpath_query& query) const;
 
 	#endif
 
@@ -983,6 +1003,7 @@ namespace pugi
 
 		void _create();
 		void _destroy();
+		void _move(xml_document& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
 
 	public:
 		// Default constructor, makes empty document
@@ -991,6 +1012,12 @@ namespace pugi
 		// Destructor, invalidates all node/attribute handles to this document
 		~xml_document();
 
+	#ifdef PUGIXML_HAS_MOVE
+		// Move semantics support
+		xml_document(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
+		xml_document& operator=(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
+	#endif
+
 		// Removes all nodes, leaving the empty document
 		void reset();
 
@@ -1004,7 +1031,7 @@ namespace pugi
 	#endif
 
 		// (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied.
-		xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+		PUGIXML_DEPRECATED xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
 
 		// Load document from zero-terminated string. No encoding conversions are applied.
 		xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default);
@@ -1131,8 +1158,8 @@ namespace pugi
 
 	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
-		xpath_variable_set(xpath_variable_set&& rhs);
-		xpath_variable_set& operator=(xpath_variable_set&& rhs);
+		xpath_variable_set(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
+		xpath_variable_set& operator=(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
 	#endif
 
 		// Add a new variable or get the existing one, if the types match
@@ -1175,8 +1202,8 @@ namespace pugi
 
 	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
-		xpath_query(xpath_query&& rhs);
-		xpath_query& operator=(xpath_query&& rhs);
+		xpath_query(xpath_query&& rhs) PUGIXML_NOEXCEPT;
+		xpath_query& operator=(xpath_query&& rhs) PUGIXML_NOEXCEPT;
 	#endif
 
 		// Get query expression return type
@@ -1316,8 +1343,8 @@ namespace pugi
 
 	#ifdef PUGIXML_HAS_MOVE
 		// Move semantics support
-		xpath_node_set(xpath_node_set&& rhs);
-		xpath_node_set& operator=(xpath_node_set&& rhs);
+		xpath_node_set(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
+		xpath_node_set& operator=(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
 	#endif
 
 		// Get collection type
@@ -1351,7 +1378,7 @@ namespace pugi
 		xpath_node* _end;
 
 		void _assign(const_iterator begin, const_iterator end, type_t type);
-		void _move(xpath_node_set& rhs);
+		void _move(xpath_node_set& rhs) PUGIXML_NOEXCEPT;
 	};
 #endif
 
@@ -1409,7 +1436,7 @@ namespace std
 #endif
 
 /**
- * Copyright (c) 2006-2016 Arseny Kapoulkine
+ * Copyright (c) 2006-2018 Arseny Kapoulkine
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
diff --git a/lib/pugixml/readme.txt b/Grid/pugixml/readme.txt
similarity index 93%
rename from lib/pugixml/readme.txt
rename to Grid/pugixml/readme.txt
index faa41d37..5beb08a9 100644
--- a/lib/pugixml/readme.txt
+++ b/Grid/pugixml/readme.txt
@@ -1,6 +1,6 @@
-pugixml 1.6 - an XML processing library
+pugixml 1.9 - an XML processing library
 
-Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 Report bugs and download new versions at http://pugixml.org/
 
 This is the distribution of pugixml, which is a C++ XML processing library,
@@ -28,7 +28,7 @@ The distribution contains the following folders:
 
 This library is distributed under the MIT License:
 
-Copyright (c) 2006-2015 Arseny Kapoulkine
+Copyright (c) 2006-2018 Arseny Kapoulkine
 
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
diff --git a/lib/qcd/LatticeTheories.h b/Grid/qcd/LatticeTheories.h
similarity index 100%
rename from lib/qcd/LatticeTheories.h
rename to Grid/qcd/LatticeTheories.h
diff --git a/lib/qcd/QCD.h b/Grid/qcd/QCD.h
similarity index 90%
rename from lib/qcd/QCD.h
rename to Grid/qcd/QCD.h
index 8816c5dc..2c8e60da 100644
--- a/lib/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -29,8 +29,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_QCD_BASE_H
-#define GRID_QCD_BASE_H
+#pragma once
+
 NAMESPACE_BEGIN(Grid);
 
 static constexpr int Xdir = 0;
@@ -88,6 +88,7 @@ template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::va
 // That probably makes for GridRedBlack4dCartesian grid.
 
 // s,sp,c,spc,lc
+
 template<typename vtype> using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
 template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
 template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
@@ -99,6 +100,8 @@ template<typename vtype> using iColourVector              = iScalar<iScalar<iVec
 template<typename vtype> using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
 template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
 template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
+
 
 template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
@@ -130,6 +133,24 @@ typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
 typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
 typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
 
+    // SpinColourSpinColour matrix
+    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+
+    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
+
+    // SpinColourSpinColour matrix -- NOTE(review): this typedef group repeats the group directly above verbatim; legal but redundant, consider dropping it
+    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+
+    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
+
 // LorentzColour
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
 typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
@@ -227,6 +248,9 @@ typedef Lattice<vSpinColourMatrix>      LatticeSpinColourMatrix;
 typedef Lattice<vSpinColourMatrixF>     LatticeSpinColourMatrixF;
 typedef Lattice<vSpinColourMatrixD>     LatticeSpinColourMatrixD;
 
+typedef Lattice<vSpinColourSpinColourMatrix>      LatticeSpinColourSpinColourMatrix;
+typedef Lattice<vSpinColourSpinColourMatrixF>     LatticeSpinColourSpinColourMatrixF;
+typedef Lattice<vSpinColourSpinColourMatrixD>     LatticeSpinColourSpinColourMatrixD;
 
 typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
 typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
@@ -419,15 +443,16 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 //////////////////////////////////////////////
 // Fermion <-> propagator assignements
 //////////////////////////////////////////////
-template <class Prop, class Ferm>
-void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
+    //template <class Prop, class Ferm>
+    template <class Fimpl>
+      void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
   for(int j = 0; j < Ns; ++j)
     {
       auto pjs = peekSpin(p, j, s);
       auto fj  = peekSpin(f, j);
             
-      for(int i = 0; i < Nc; ++i)
+            for(int i = 0; i < Fimpl::Dimension; ++i)
 	{
 	  pokeColour(pjs, peekColour(fj, i), i, c);
 	}
@@ -435,15 +460,16 @@ void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
     }
 }
     
-template <class Prop, class Ferm>
-void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
+    //template <class Prop, class Ferm>
+    template <class Fimpl>
+      void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
   for(int j = 0; j < Ns; ++j)
     {
       auto pjs = peekSpin(p, j, s);
       auto fj  = peekSpin(f, j);
             
-      for(int i = 0; i < Nc; ++i)
+            for(int i = 0; i < Fimpl::Dimension; ++i)
 	{
 	  pokeColour(fj, peekColour(pjs, i, c), i);
 	}
@@ -501,4 +527,3 @@ GRID_SERIALIZABLE_ENUM(Current, undef,
 
 NAMESPACE_END(Grid);
 
-#endif
diff --git a/lib/qcd/action/Action.h b/Grid/qcd/action/Action.h
similarity index 94%
rename from lib/qcd/action/Action.h
rename to Grid/qcd/action/Action.h
index 7272c90d..737c1ff0 100644
--- a/lib/qcd/action/Action.h
+++ b/Grid/qcd/action/Action.h
@@ -37,14 +37,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // Abstract base interface
 ////////////////////////////////////////////
 #include <Grid/qcd/action/ActionCore.h>
+NAMESPACE_CHECK(ActionCore);
 ////////////////////////////////////////////////////////////////////////
 // Fermion actions; prevent coupling fermion.cc files to other headers
 ////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/FermionCore.h>
+NAMESPACE_CHECK(FermionCore);
 #include <Grid/qcd/action/fermion/Fermion.h>
+NAMESPACE_CHECK(Fermion);
 ////////////////////////////////////////
 // Pseudo fermion combinations for HMC
 ////////////////////////////////////////
 #include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
+NAMESPACE_CHECK(PseudoFermion);
 
 #endif
diff --git a/lib/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h
similarity index 100%
rename from lib/qcd/action/ActionBase.h
rename to Grid/qcd/action/ActionBase.h
diff --git a/lib/qcd/action/ActionCore.h b/Grid/qcd/action/ActionCore.h
similarity index 89%
rename from lib/qcd/action/ActionCore.h
rename to Grid/qcd/action/ActionCore.h
index 7a5caf15..6544318d 100644
--- a/lib/qcd/action/ActionCore.h
+++ b/Grid/qcd/action/ActionCore.h
@@ -31,29 +31,37 @@ directory
 #define QCD_ACTION_CORE
 
 #include <Grid/qcd/action/ActionBase.h>
+NAMESPACE_CHECK(ActionBase);
 #include <Grid/qcd/action/ActionSet.h>
+NAMESPACE_CHECK(ActionSet);
 #include <Grid/qcd/action/ActionParams.h>
+NAMESPACE_CHECK(ActionParams);
 
 ////////////////////////////////////////////
 // Gauge Actions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/gauge/Gauge.h>
+NAMESPACE_CHECK(Gauge);
 
 ////////////////////////////////////////////
 // Fermion prereqs
 ////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/FermionCore.h>
+NAMESPACE_CHECK(ActionFermionCore);
 
 ////////////////////////////////////////////
 // Scalar Actions
 ////////////////////////////////////////////
 #include <Grid/qcd/action/scalar/Scalar.h>
+NAMESPACE_CHECK(Scalar);
 
 ////////////////////////////////////////////
 // Utility functions
 ////////////////////////////////////////////
 #include <Grid/qcd/utils/Metric.h>
+NAMESPACE_CHECK(Metric);
 #include <Grid/qcd/utils/CovariantLaplacian.h>
+NAMESPACE_CHECK(CovariantLaplacian);
 
 
 
diff --git a/lib/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h
similarity index 90%
rename from lib/qcd/action/ActionParams.h
rename to Grid/qcd/action/ActionParams.h
index 1e0af1d7..cce9d7c9 100644
--- a/lib/qcd/action/ActionParams.h
+++ b/Grid/qcd/action/ActionParams.h
@@ -41,12 +41,16 @@ struct GparityWilsonImplParams {
 };
   
 struct WilsonImplParams {
+  bool overlapCommsCompute = false; // default member initialiser: well-defined even via the default constructor, which does not set it
+  std::vector<Real> twist_n_2pi_L;  // resized to Nd entries of 0.0 by both constructors
   std::vector<Complex> boundary_phases;
   WilsonImplParams()  {
     boundary_phases.resize(Nd, 1.0);
+      twist_n_2pi_L.resize(Nd, 0.0);
   };
-  WilsonImplParams(const std::vector<Complex> phi)
-    : boundary_phases(phi) {}
+  WilsonImplParams(const std::vector<Complex> phi) : overlapCommsCompute(false), boundary_phases(phi) { // initialisers listed in declaration order
+    twist_n_2pi_L.resize(Nd, 0.0);
+  }
 };
 
 struct StaggeredImplParams {
diff --git a/lib/qcd/action/ActionSet.h b/Grid/qcd/action/ActionSet.h
similarity index 100%
rename from lib/qcd/action/ActionSet.h
rename to Grid/qcd/action/ActionSet.h
diff --git a/lib/qcd/action/fermion/AbstractEOFAFermion.h b/Grid/qcd/action/fermion/AbstractEOFAFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/AbstractEOFAFermion.h
rename to Grid/qcd/action/fermion/AbstractEOFAFermion.h
diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/Grid/qcd/action/fermion/CayleyFermion5D.cc
similarity index 84%
rename from lib/qcd/action/fermion/CayleyFermion5D.cc
rename to Grid/qcd/action/fermion/CayleyFermion5D.cc
index 52ef9f82..4a6c4c91 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.cc
@@ -51,6 +51,82 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
 { 
 }
 
+///////////////////////////////////////////////////////////////
+// Physical surface field utilities
+///////////////////////////////////////////////////////////////
+template<class Impl>  
+void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+{
+  int Ls = this->Ls;
+  FermionField tmp(this->FermionGrid());
+  tmp = solution5d;
+  conformable(solution5d.Grid(),this->FermionGrid());
+  conformable(exported4d.Grid(),this->GaugeGrid());
+  axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
+  axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
+  ExtractSlice(exported4d, tmp, 0, 0);
+}
+template<class Impl>  
+void CayleyFermion5D<Impl>::P(const FermionField &psi, FermionField &chi)
+{
+  int Ls= this->Ls;
+  chi=Zero();
+  for(int s=0;s<Ls;s++){
+    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
+    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s+1)%Ls);
+  }
+}
+template<class Impl>  
+void CayleyFermion5D<Impl>::Pdag(const FermionField &psi, FermionField &chi)
+{
+  int Ls= this->Ls;
+  chi=Zero();
+  for(int s=0;s<Ls;s++){
+    axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
+    axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s-1+Ls)%Ls);
+  }
+}
+template<class Impl>  
+void CayleyFermion5D<Impl>::ExportPhysicalFermionSource(const FermionField &solution5d,FermionField &exported4d)
+{
+  int Ls = this->Ls;
+  FermionField tmp(this->FermionGrid());
+  tmp = solution5d;
+  conformable(solution5d.Grid(),this->FermionGrid());
+  conformable(exported4d.Grid(),this->GaugeGrid());
+  axpby_ssp_pplus (tmp, 0., solution5d, 1., solution5d, 0, 0);
+  axpby_ssp_pminus(tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
+  ExtractSlice(exported4d, tmp, 0, 0);
+}
+template<class Impl>
+void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,FermionField &imported5d)
+{
+  int Ls = this->Ls;
+  FermionField tmp(this->FermionGrid());
+  conformable(imported5d.Grid(),this->FermionGrid());
+  conformable(input4d.Grid()   ,this->GaugeGrid());
+  tmp = Zero();
+  InsertSlice(input4d, tmp, 0   , 0);
+  InsertSlice(input4d, tmp, Ls-1, 0);
+  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
+  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
+  imported5d=tmp;
+}
+
+template<class Impl>  
+void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+{
+  int Ls = this->Ls;
+  FermionField tmp(this->FermionGrid());
+  conformable(imported5d.Grid(),this->FermionGrid());
+  conformable(input4d.Grid()   ,this->GaugeGrid());
+  tmp = Zero();
+  InsertSlice(input4d, tmp, 0   , 0);
+  InsertSlice(input4d, tmp, Ls-1, 0);
+  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
+  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
+  Dminus(tmp,imported5d);
+}
 template<class Impl>  
 void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 {
@@ -72,7 +148,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
   this->DW(psi,tmp_f,DaggerYes);
 
   for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
   }
 }
 
@@ -404,9 +480,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
     
   double bpc = b+c;
   double bmc = b-c;
+  _b = b;
+  _c = c;
+  _gamma  = gamma; // Save the parameters so we can change mass later.
+  _zolo_hi= zolo_hi;
   for(int i=0; i < Ls; i++){
     as[i] = 1.0;
-    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
     assert(omega[i]!=Coeff_t(0.0));
     bs[i] = 0.5*(bpc/omega[i] + bmc);
     cs[i] = 0.5*(bpc/omega[i] - bmc);
diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h
similarity index 85%
rename from lib/qcd/action/fermion/CayleyFermion5D.h
rename to Grid/qcd/action/fermion/CayleyFermion5D.h
index dcbfb2d0..6dbc630e 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/Grid/qcd/action/fermion/CayleyFermion5D.h
@@ -81,8 +81,26 @@ public:
   virtual void   M5D   (const FermionField &psi, FermionField &chi);
   virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
 
+      ///////////////////////////////////////////////////////////////
+      // Physical surface field utilities
+      ///////////////////////////////////////////////////////////////
   virtual void   Dminus(const FermionField &psi, FermionField &chi);
   virtual void   DminusDag(const FermionField &psi, FermionField &chi);
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+      virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
+      virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
+      virtual void ImportUnphysicalFermion(const FermionField &input4d, FermionField &imported5d); // 4d input -> 5d output; names match the definition in CayleyFermion5D.cc
+
+      ///////////////////////////////////////////////////////////////
+      // Support for MADWF tricks
+      ///////////////////////////////////////////////////////////////
+      RealD Mass(void) { return mass; };
+      void  SetMass(RealD _mass) { 
+	mass=_mass; 
+	SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c);  // Reset coeffs
+      } ;
+      void  P(const FermionField &psi, FermionField &chi);
+      void  Pdag(const FermionField &psi, FermionField &chi);
 
   /////////////////////////////////////////////////////
   // Instantiate different versions depending on Impl
@@ -130,6 +148,12 @@ public:
   //    protected:
   RealD mass;
 
+      // Save arguments to SetCoefficientsInternal
+      std::vector<Coeff_t> _gamma;
+      RealD                _zolo_hi;
+      RealD                _b;
+      RealD                _c;
+
   // Cayley form Moebius (tanh and zolotarev)
   std::vector<Coeff_t> omega;
   std::vector<Coeff_t> bs;    // S dependent coeffs
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc b/Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
similarity index 100%
rename from lib/qcd/action/fermion/CayleyFermion5Dcache.cc
rename to Grid/qcd/action/fermion/CayleyFermion5Dcache.cc
diff --git a/lib/qcd/action/fermion/CayleyFermion5Ddense.cc b/Grid/qcd/action/fermion/CayleyFermion5Ddense.cc
similarity index 100%
rename from lib/qcd/action/fermion/CayleyFermion5Ddense.cc
rename to Grid/qcd/action/fermion/CayleyFermion5Ddense.cc
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc b/Grid/qcd/action/fermion/CayleyFermion5Dssp.cc
similarity index 100%
rename from lib/qcd/action/fermion/CayleyFermion5Dssp.cc
rename to Grid/qcd/action/fermion/CayleyFermion5Dssp.cc
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
similarity index 99%
rename from lib/qcd/action/fermion/CayleyFermion5Dvec.cc
rename to Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
index 82920e6d..95bd31bd 100644
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/Grid/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -475,7 +475,7 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionF
 	  }
 	  a0 = a0+incr;
 	  a1 = a1+incr;
-	  a2 = a2+sizeof(Simd::scalar_type);
+	a2 = a2+sizeof(typename Simd::scalar_type);
 	}}
       {
 	int lexa = s1+LLs*site;
@@ -712,7 +712,7 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, Fermion
 	  }
 	  a0 = a0+incr;
 	  a1 = a1+incr;
-	  a2 = a2+sizeof(Simd::scalar_type);
+	a2 = a2+sizeof(typename Simd::scalar_type);
 	}}
       {
 	int lexa = s1+LLs*site;
diff --git a/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
similarity index 91%
rename from lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
rename to Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
index 93701f84..5fb80216 100644
--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.cc
@@ -294,6 +294,27 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
   assert((Ls&0x1)==1); // Odd Ls required
 }
 
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    {
+      int Ls = this->Ls;
+      conformable(solution5d.Grid(),this->FermionGrid());
+      conformable(exported4d.Grid(),this->GaugeGrid());
+      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+    }
+    template<class Impl>
+    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    {
+      int Ls = this->Ls;
+      conformable(imported5d.Grid(),this->FermionGrid());
+      conformable(input4d.Grid()   ,this->GaugeGrid());
+      FermionField tmp(this->FermionGrid());
+      tmp=Zero();
+      InsertSlice(input4d, tmp, Ls-1, Ls-1);
+      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
+      this->Dminus(tmp,imported5d);
+    }
+
 FermOpTemplateInstantiate(ContinuedFractionFermion5D);
 
 NAMESPACE_END(Grid);
diff --git a/lib/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
similarity index 85%
rename from lib/qcd/action/fermion/ContinuedFractionFermion5D.h
rename to Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
index 43ed4840..0e0c1d75 100644
--- a/lib/qcd/action/fermion/ContinuedFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h
@@ -63,6 +63,14 @@ public:
   // Efficient support for multigrid coarsening
   virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
 
+      ///////////////////////////////////////////////////////////////
+      // Physical surface field utilities
+      ///////////////////////////////////////////////////////////////
+      //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
+      //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+
   // Constructors
   ContinuedFractionFermion5D(GaugeField &_Umu,
 			     GridCartesian         &FiveDimGrid,
diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermion.cc b/Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
similarity index 100%
rename from lib/qcd/action/fermion/DomainWallEOFAFermion.cc
rename to Grid/qcd/action/fermion/DomainWallEOFAFermion.cc
diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermion.h b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/DomainWallEOFAFermion.h
rename to Grid/qcd/action/fermion/DomainWallEOFAFermion.h
diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc b/Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
similarity index 100%
rename from lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
rename to Grid/qcd/action/fermion/DomainWallEOFAFermioncache.cc
diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc b/Grid/qcd/action/fermion/DomainWallEOFAFermiondense.cc
similarity index 100%
rename from lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
rename to Grid/qcd/action/fermion/DomainWallEOFAFermiondense.cc
diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc b/Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc
similarity index 100%
rename from lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
rename to Grid/qcd/action/fermion/DomainWallEOFAFermionssp.cc
diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc b/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
similarity index 99%
rename from lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
rename to Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
index 3d20befe..43fa16ec 100644
--- a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/DomainWallEOFAFermionvec.cc
@@ -484,7 +484,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, Fe
 	  }
 	  a0 = a0 + incr;
 	  a1 = a1 + incr;
-	  a2 = a2 + sizeof(Simd::scalar_type);
+                        a2 = a2 + sizeof(typename Simd::scalar_type);
 	}
       }
 
diff --git a/lib/qcd/action/fermion/DomainWallFermion.h b/Grid/qcd/action/fermion/DomainWallFermion.h
similarity index 52%
rename from lib/qcd/action/fermion/DomainWallFermion.h
rename to Grid/qcd/action/fermion/DomainWallFermion.h
index 16e1630b..8758d6e9 100644
--- a/lib/qcd/action/fermion/DomainWallFermion.h
+++ b/Grid/qcd/action/fermion/DomainWallFermion.h
@@ -8,6 +8,7 @@
 
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -40,8 +41,58 @@ public:
   INHERIT_IMPL_TYPES(Impl);
 public:
 
-  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m) { 
-    this->MomentumSpacePropagatorHt(out,in,_m);
+      void FreePropagator(const FermionField &in,FermionField &out,RealD mass, std::vector<double> twist, bool fiveD) { // FFT the de-phased source, apply the momentum-space propagator, FFT back, re-phase
+	FermionField in_k(in.Grid());
+	FermionField prop_k(in.Grid());
+
+	FFT theFFT((GridCartesian *) in.Grid());
+
+	// Twist phase field: ph = sum_nu twist[nu] * x_nu / L_nu (L_nu read from FullDimensions())
+	ComplexField coor(in.Grid());
+	ComplexField ph(in.Grid());  ph = Zero();
+	FermionField in_buf(in.Grid()); in_buf = Zero();
+	Complex ci(0.0,1.0);
+	assert(twist.size() == Nd);// twist must supply exactly Nd entries
+	int shift = 0;
+	if(fiveD) shift = 1;
+	for(unsigned int nu = 0; nu < Nd; nu++)
+	{
+	  // Shift coordinate lattice index by 1 to account for 5th dimension.
+          LatticeCoordinate(coor, nu + shift);
+	  ph = ph + twist[nu]*coor*((1./(in.Grid()->FullDimensions()[nu+shift])));
+	}
+	in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in; // source multiplied by exp(-2*pi*i*ph)
+
+	if(fiveD){// mask[0]=0: dimension 0 is excluded from the FFT, the remaining Nd are transformed
+          std::vector<int> mask(Nd+1,1); mask[0] = 0;
+	  theFFT.FFT_dim_mask(in_k,in_buf,mask,FFT::forward);
+          this->MomentumSpacePropagatorHt_5d(prop_k,in_k,mass,twist);
+          theFFT.FFT_dim_mask(out,prop_k,mask,FFT::backward);
+        }
+	else{
+	  theFFT.FFT_all_dim(in_k,in_buf,FFT::forward); // was 'in': the de-phased source must be used here too, since 'out' is re-phased below in both branches
+          this->MomentumSpacePropagatorHt(prop_k,in_k,mass,twist);
+	  theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+        }
+
+	// Restore the twist phase on the result: multiply by exp(+2*pi*i*ph)
+	out = out * exp((Real)(2.0*M_PI)*ci*ph);
+      };
+
+      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist) {
+        bool fiveD = true; // default: take the 5d path of the five-argument form
+        FreePropagator(in,out,mass,twist,fiveD);
+      };
+
+      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass, bool fiveD) {
+	std::vector<double> twist(Nd,0.0); // default: zero twist, i.e. periodic boundaries in all directions
+        FreePropagator(in,out,mass,twist,fiveD);
+      };
+
+      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+        bool fiveD = true; // default: take the 5d path of the five-argument form
+	std::vector<double> twist(Nd,0.0); // default: zero twist, i.e. periodic boundaries in all directions
+        FreePropagator(in,out,mass,twist,fiveD);
   };
 
   virtual void   Instantiatable(void) {};
diff --git a/lib/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h
similarity index 86%
rename from lib/qcd/action/fermion/Fermion.h
rename to Grid/qcd/action/fermion/Fermion.h
index 05f079e8..d1b14923 100644
--- a/lib/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -25,8 +25,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef  GRID_QCD_FERMION_H
-#define  GRID_QCD_FERMION_H
+#pragma once
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Explicit explicit template instantiation is still required in the .cc files
@@ -50,17 +49,26 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 
 #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
+NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
+NAMESPACE_CHECK(WilsonTM);
+#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
+NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
-//#include <Grid/qcd/action/fermion/CloverFermion.h>
+NAMESPACE_CHECK(Wilson5D);
+
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
+NAMESPACE_CHECK(Staggered);
+
 #include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 #include <Grid/qcd/action/fermion/MobiusFermion.h>
 #include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 #include <Grid/qcd/action/fermion/ZMobiusFermion.h>
+NAMESPACE_CHECK(DomainWall);
+
 #include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
 #include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
 #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
@@ -73,17 +81,32 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 #include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
+NAMESPACE_CHECK(Overlap);
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/g5HermitianLinop.h>
 
+///////////////////////////////////////////////////////////////////////////////
+// Fourier accelerated Pauli Villars inverse support
+///////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/WilsonTMFermion5D.h>   
+NAMESPACE_CHECK(WilsonTM5);
+
+////////////////////////////////////////////////////////////////////////////////
+// Move this group to a DWF specific tools/algorithms subdir? 
+////////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/FourierAcceleratedPV.h>
+#include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
+#include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
+#include <Grid/qcd/action/fermion/MADWF.h>
+NAMESPACE_CHECK(DWFutils);
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // More maintainable to maintain the following typedef list centrally, as more "impl" targets
 // are added, (e.g. extension for gparity, half precision project in comms etc..)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
 // Cayley 5d
 NAMESPACE_BEGIN(Grid);
 
@@ -103,10 +126,33 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermi
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
 
+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
+typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
+
+// Twisted mass fermion
 typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
 typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 
+// Clover fermions
+typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
+typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
+
+typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
+typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
+typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
+
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
+
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
+
+// Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
@@ -283,8 +329,6 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
-
-
 NAMESPACE_END(Grid);
 
 ////////////////////
@@ -294,4 +338,4 @@ NAMESPACE_END(Grid);
 #include <Grid/qcd/action/scalar/Scalar.h>
 #include <Grid/qcd/action/gauge/Photon.h>
 
-#endif
+
diff --git a/lib/qcd/action/fermion/FermionCore.h b/Grid/qcd/action/fermion/FermionCore.h
similarity index 93%
rename from lib/qcd/action/fermion/FermionCore.h
rename to Grid/qcd/action/fermion/FermionCore.h
index 5131febb..20a8fae9 100644
--- a/lib/qcd/action/fermion/FermionCore.h
+++ b/Grid/qcd/action/fermion/FermionCore.h
@@ -36,10 +36,13 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 // Fermion prereqs
 ////////////////////////////////////////////
 #include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions
+NAMESPACE_CHECK(Compressor);
 #include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
 #include <Grid/qcd/action/fermion/FermionOperator.h>
+NAMESPACE_CHECK(FermionOperator);
 #include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
 #include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all wilson type fermions
+NAMESPACE_CHECK(Kernels);
 
 #define FermOpStaggeredTemplateInstantiate(A) \
   template class A<StaggeredImplF>; \
@@ -72,7 +75,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 
 #define TwoIndexFermOpTemplateInstantiate(A) \
   template class A<WilsonTwoIndexSymmetricImplF>; \
-  template class A<WilsonTwoIndexSymmetricImplD>; 
+  template class A<WilsonTwoIndexSymmetricImplD>; \
+  template class A<WilsonTwoIndexAntiSymmetricImplF>; \
+  template class A<WilsonTwoIndexAntiSymmetricImplD>;
 
 #define FermOp5dVecTemplateInstantiate(A) \
   template class A<DomainWallVec5dImplF>;	\
diff --git a/lib/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h
similarity index 70%
rename from lib/qcd/action/fermion/FermionOperator.h
rename to Grid/qcd/action/fermion/FermionOperator.h
index 9ff83594..7c64ea35 100644
--- a/lib/qcd/action/fermion/FermionOperator.h
+++ b/Grid/qcd/action/fermion/FermionOperator.h
@@ -9,6 +9,7 @@
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -62,8 +63,6 @@ public:
   virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;
 
   // half checkerboard operaions
-  virtual int    ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
-
   virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
   virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
   virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
@@ -93,17 +92,39 @@ public:
   virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
 
 
-  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m) { assert(0);};
+      virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
 
-  virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass) { 
-    FFT theFFT((GridCartesian *) in.Grid());
+      virtual void  FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist) 
+      { // Fills out from in: twist phase, forward FFT, MomentumSpacePropagator, backward FFT, inverse twist phase.
+	FFT theFFT((GridCartesian *) in.Grid());
 
-    FermionField in_k(in.Grid());
-    FermionField prop_k(in.Grid());
+	FermionField in_k(in.Grid());
+	FermionField prop_k(in.Grid());
 
-    theFFT.FFT_all_dim(in_k,in,FFT::forward);
-    this->MomentumSpacePropagator(prop_k,in_k,mass);
+	// Phase for the boundary condition: ph accumulates twist[nu]*coor/_fdimensions[nu] over all nu
+	ComplexField coor(in.Grid());
+	ComplexField ph(in.Grid());  ph = Zero();
+	FermionField in_buf(in.Grid()); in_buf = Zero();
+	Complex ci(0.0,1.0);
+	assert(twist.size() == Nd);// twist must hold exactly Nd entries, one per direction
+	for(unsigned int nu = 0; nu < Nd; nu++)
+	{
+          LatticeCoordinate(coor, nu); // coor is refilled for direction nu on every pass
+	  ph = ph + twist[nu]*coor*((1./(in.Grid()->_fdimensions[nu])));
+	}
+	in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in; // in_buf = exp(-2*pi*i*ph) * in
+	// The phase-rotated source (not in itself) is what gets Fourier transformed
+	theFFT.FFT_all_dim(in_k,in_buf,FFT::forward);
+        this->MomentumSpacePropagator(prop_k,in_k,mass,twist);
     theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+
+	// Opposite-sign phase applied to the result: out = out * exp(+2*pi*i*ph)
+	out = out * exp((Real)(2.0*M_PI)*ci*ph);
+
+      };
+      virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
+		std::vector<double> twist(Nd,0.0); // default: zero twist, i.e. periodic boundaries in all Nd directions
+	        FreePropagator(in,out,mass,twist); // forwards to the four-argument overload
   };
 
   ///////////////////////////////////////////////
@@ -123,9 +144,30 @@ public:
 				   PropagatorField &q_out,
 				   Current curr_type,
 				   unsigned int mu,
-				   std::vector<Real> mom,
 				   unsigned int tmin, 
-				   unsigned int tmax)=0;
+                                       unsigned int tmax,
+                                       ComplexField &lattice_cmplx)=0;
+      ///////////////////////////////////////////////
+      // Physical field import/export
+      ///////////////////////////////////////////////
+      virtual void Dminus(const FermionField &psi, FermionField &chi)    { chi=psi; } // default: copies psi into chi unchanged; virtual, so subclasses may override
+      virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; } // default: copies psi into chi unchanged; virtual, so subclasses may override
+      virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)
+      { // default implementation: no transformation, imported is a plain copy of input
+	imported = input;
+      };
+      virtual void ImportUnphysicalFermion(const FermionField &input,FermionField &imported)
+      { // default implementation: no transformation, imported is a plain copy of input
+	imported=input;
+      };
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported)
+      { // default implementation: no transformation, exported is a plain copy of solution
+	exported=solution;
+      };
+      virtual void ExportPhysicalFermionSource(const FermionField &solution,FermionField &exported)
+      { // default implementation: no transformation, exported is a plain copy of its first argument
+	exported=solution;
+      };
 };
 
 NAMESPACE_END(Grid);
diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/Grid/qcd/action/fermion/FermionOperatorImpl.h
similarity index 83%
rename from lib/qcd/action/fermion/FermionOperatorImpl.h
rename to Grid/qcd/action/fermion/FermionOperatorImpl.h
index 44704310..e437b027 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
@@ -141,6 +141,7 @@ public:
   
 #define INHERIT_FIMPL_TYPES(Impl)\
   typedef typename Impl::Coeff_t                     Coeff_t;           \
+  typedef Impl Impl_t;							\
   typedef typename Impl::FermionField           FermionField;		\
   typedef typename Impl::PropagatorField     PropagatorField;		\
   typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
@@ -167,6 +168,7 @@ class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Di
 public:
 
   static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
   static const bool LsVectorised=false;
   static const int Nhcs = Options::Nhcs;
 
@@ -271,16 +273,30 @@ public:
     GaugeLinkField tmp(GaugeGrid);
 
     Lattice<iScalar<vInteger> > coor(GaugeGrid);
+      ////////////////////////////////////////////////////
+      // apply any boundary phase or twists
+      ////////////////////////////////////////////////////
     for (int mu = 0; mu < Nd; mu++) {
 
+	////////// boundary phase /////////////
       auto pha = Params.boundary_phases[mu];
       scalar_type phase( real(pha),imag(pha) );
 
-      int Lmu = GaugeGrid->GlobalDimensions()[mu] - 1;
+	int L   = GaugeGrid->GlobalDimensions()[mu];
+        int Lmu = L - 1;
 
       LatticeCoordinate(coor, mu);
 
       U = PeekIndex<LorentzIndex>(Umu, mu);
+
+	// apply any twists
+	RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
+	if ( theta != 0.0) { 
+	  scalar_type twphase(::cos(theta),::sin(theta));
+	  U = twphase*U;
+	  std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
+	}
+
       tmp = where(coor == Lmu, phase * U, U);
       PokeIndex<LorentzIndex>(Uds, tmp, mu);
 
@@ -296,6 +312,20 @@ public:
     PokeIndex<LorentzIndex>(mat,link,mu);
   }   
       
+    inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
+      mat = outerProduct(B,A); // mat is overwritten with outerProduct(B,A); argument order is B then A
+    }  
+
+    inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+      mat = TraceIndex<SpinIndex>(P); // mat is overwritten with the trace of P over SpinIndex
+    }
+      
+    // Copies Lorentz components 0..Nd-1 of Uds into mat[0..Nd-1]; mat is indexed directly, so it must already hold at least Nd entries.
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+      for (int mu = 0; mu < Nd; mu++)
+      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
+    }
+
   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
       
     int Ls=Btilde.Grid()->_fdimensions[0];
@@ -319,27 +349,28 @@ public:
 ////////////////////////////////////////////////////////////////////////////////////
 // Single flavour four spinors with colour index, 5d redblack
 ////////////////////////////////////////////////////////////////////////////////////
-template<class S,int Nrepresentation=Nc, class Options=CoeffReal>
-class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > { 
 public:
 
-  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
   INHERIT_GIMPL_TYPES(Gimpl);
 
-  static const int Dimension = Nrepresentation;
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
   static const bool LsVectorised=true;
   static const int Nhcs = Options::Nhcs;
       
   typedef typename Options::_Coeff_t Coeff_t;      
   typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
   
-  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
-  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
-  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
-  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhcs> >;
-  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
-  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
-  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
   
   typedef iImplSpinor<Simd>            SiteSpinor;
   typedef iImplPropagator<Simd>        SitePropagator;
@@ -376,8 +407,8 @@ public:
 					  StencilView &St) 
   {
     SiteGaugeLink UU;
-    for (int i = 0; i < Nrepresentation; i++) {
-      for (int j = 0; j < Nrepresentation; j++) {
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
         vsplat(UU()()(i, j), U(mu)()(i, j));
       }
     }
@@ -430,8 +461,8 @@ public:
 					      const SitePropagator &chi,int mu) 
   {
     SiteGaugeLink UU;
-    for (int i = 0; i < Nrepresentation; i++) {
-      for (int j = 0; j < Nrepresentation; j++) {
+    for (int i = 0; i < Dimension; i++) {
+      for (int j = 0; j < Dimension; j++) {
         vsplat(UU()()(i, j), U(mu)()(i, j));
       }
     }
@@ -470,6 +501,19 @@ public:
     assert(0);
   }
 
+  inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+    assert(0); // stub: no implementation here; trips the assertion when asserts are enabled and leaves mat untouched
+  } 
+
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0); // stub: no implementation here; trips the assertion when asserts are enabled and leaves mat untouched
+  }
+
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0); // stub: no implementation here; trips the assertion when asserts are enabled and leaves mat untouched
+  }
+
+
   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
 
     assert(0);
@@ -521,25 +565,26 @@ public:
 ////////////////////////////////////////////////////////////////////////////////////////
 // Flavour doubled spinors; is Gparity the only? what about C*?
 ////////////////////////////////////////////////////////////////////////////////////////
-template <class S, int Nrepresentation, class Options=CoeffReal>
-class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
+template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
+class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 public:
 
-  static const int Dimension = Nrepresentation;
+ static const int Dimension = Representation::Dimension;
+ static const bool isFundamental = Representation::isFundamental;
   static const int Nhcs = Options::Nhcs;
   static const bool LsVectorised=false;
 
-  typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
+ typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
   INHERIT_GIMPL_TYPES(Gimpl);
 
   typedef typename Options::_Coeff_t Coeff_t;
   typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
       
-  template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>,   Ngp>;
-  template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>,   Ngp>;
-  template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>,  Ngp>;
-  template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Nrepresentation>, Nhcs>, Ngp>;
-  template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
+ template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Dimension>, Nhs>,  Ngp>;
+ template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
+ template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
 
   typedef iImplSpinor<Simd>            SiteSpinor;
   typedef iImplPropagator<Simd>        SitePropagator;
@@ -721,6 +766,25 @@ public:
     return;
   }
       
+ inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
+   // Disabled draft, kept for reference: mat = outerProduct(Btilde, A);
+   assert(0); // stub: trips the assertion when asserts are enabled and leaves mat untouched
+  }
+
+  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+    assert(0); // stub: trips the assertion when asserts are enabled and leaves mat untouched
+    /* Disabled draft, kept for reference (never compiled):
+    auto tmp = TraceIndex<SpinIndex>(P);
+    parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
+      mat[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
+    }
+    */
+  }
+
+  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    assert(0); // stub: no implementation here; trips the assertion when asserts are enabled and leaves mat untouched
+  }
+  
   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
 
     int Ls = Btilde.Grid()->_fdimensions[0];
@@ -753,6 +817,7 @@ public:
 
   typedef RealD  _Coeff_t ;
   static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
   static const bool LsVectorised=false;
   typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
       
@@ -807,6 +872,11 @@ public:
     reg = memory;
   }
       
+    inline void InsertGaugeField(DoubledGaugeField &U_ds,
+				 const GaugeLinkField &U,int mu)
+    { // single point through which DoubleStore writes links into a doubled field
+      PokeIndex<LorentzIndex>(U_ds, U, mu); // stores U as Lorentz component mu of U_ds
+    }
   inline void DoubleStore(GridBase *GaugeGrid,
 			  DoubledGaugeField &UUUds, // for Naik term
 			  DoubledGaugeField &Uds,
@@ -845,8 +915,10 @@ public:
       U    = U    *phases;
       Udag = Udag *phases;
 
-      PokeIndex<LorentzIndex>(Uds, U, mu);
-      PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
+	InsertGaugeField(Uds,U,mu);
+	InsertGaugeField(Uds,Udag,mu+4);
+	//	PokeIndex<LorentzIndex>(Uds, U, mu);
+	//	PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
 
       // 3 hop based on thin links. Crazy huh ?
       U  = PeekIndex<LorentzIndex>(Uthin, mu);
@@ -858,8 +930,8 @@ public:
       UUU    = UUU    *phases;
       UUUdag = UUUdag *phases;
 
-      PokeIndex<LorentzIndex>(UUUds, UUU, mu);
-      PokeIndex<LorentzIndex>(UUUds, UUUdag, mu+4);
+	InsertGaugeField(UUUds,UUU,mu);
+	InsertGaugeField(UUUds,UUUdag,mu+4);
 
     }
   }
@@ -885,6 +957,7 @@ class StaggeredVec5dImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representa
 public:
 
   static const int Dimension = Representation::Dimension;
+    static const bool isFundamental = Representation::isFundamental;
   static const bool LsVectorised=true;
   typedef RealD   Coeff_t ;
   typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@@ -959,6 +1032,23 @@ public:
     mac(&phi(), &UU(), &chi());
   }
       
+  // Copy link field U into Lorentz slot mu of the doubled field U_ds, site by site (peek, modify, poke).
+  // NOTE(review): the write-back now runs inside thread_loop; confirm pokeLocalSite is safe to call concurrently.
+  inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
+  {
+    GridBase *GaugeGrid = U_ds.Grid();
+    thread_loop( (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++), {
+	SiteScalarGaugeLink   ScalarU;
+	SiteDoubledGaugeField ScalarUds;
+	Coordinate lcoor;
+	GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+	peekLocalSite(ScalarUds, U_ds, lcoor); // start from the current site so the other components are preserved
+	peekLocalSite(ScalarU, U, lcoor);
+	ScalarUds(mu) = ScalarU();
+	// Write the modified site back; without this the update stays in the local ScalarUds and U_ds never changes.
+	pokeLocalSite(ScalarUds, U_ds, lcoor);
+    });
+  }
   inline void DoubleStore(GridBase *GaugeGrid,
 			  DoubledGaugeField &UUUds, // for Naik term
 			  DoubledGaugeField &Uds,
@@ -1000,23 +1090,8 @@ public:
       U    = U    *phases;
       Udag = Udag *phases;
 
-
-      for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
-	SiteScalarGaugeLink   ScalarU;
-	SiteDoubledGaugeField ScalarUds;
-	  
-	Coordinate lcoor;
-	GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
-	peekLocalSite(ScalarUds, Uds, lcoor);
-
-	peekLocalSite(ScalarU, U, lcoor);
-	ScalarUds(mu) = ScalarU();
-
-	peekLocalSite(ScalarU, Udag, lcoor);
-	ScalarUds(mu + 4) = ScalarU();
-
-	pokeLocalSite(ScalarUds, Uds, lcoor);
-      }
+      InsertGaugeField(Uds,U,mu);
+      InsertGaugeField(Uds,Udag,mu+4);
 
       // 3 hop based on thin links. Crazy huh ?
       U  = PeekIndex<LorentzIndex>(Uthin, mu);
@@ -1028,24 +1103,8 @@ public:
       UUU    = UUU    *phases;
       UUUdag = UUUdag *phases;
 
-      for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
-
-	SiteScalarGaugeLink  ScalarU;
-	SiteDoubledGaugeField ScalarUds;
-	  
-	Coordinate lcoor;
-	GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
-      
-	peekLocalSite(ScalarUds, UUUds, lcoor);
-
-	peekLocalSite(ScalarU, UUU, lcoor);
-	ScalarUds(mu) = ScalarU();
-
-	peekLocalSite(ScalarU, UUUdag, lcoor);
-	ScalarUds(mu + 4) = ScalarU();
-	  
-	pokeLocalSite(ScalarUds, UUUds, lcoor);
-      }
+      InsertGaugeField(UUUds,UUU,mu);
+      InsertGaugeField(UUUds,UUUdag,mu+4);
 
     }
   }
@@ -1083,29 +1142,33 @@ typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation, CoeffReal > Wilso
 typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF;  // Float
 typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD;  // Double
  
-typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double
+typedef WilsonImpl<vComplex,  TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplR;   // Real.. whichever prec
+typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float
+typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double
+
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
  
-typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
  
-typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
  
-typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
-typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
-typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
+typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
+typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
+typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
  
-typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
-typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF;  // Float
-typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD;  // Double
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
  
-typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
-typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
-typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
 
 typedef StaggeredImpl<vComplex,  FundamentalRepresentation > StaggeredImplR;   // Real.. whichever prec
 typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float
diff --git a/Grid/qcd/action/fermion/FourierAcceleratedPV.h b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
new file mode 100644
index 00000000..fd5010ce
--- /dev/null
+++ b/Grid/qcd/action/fermion/FourierAcceleratedPV.h
@@ -0,0 +1,238 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/qcd/action/fermion/FourierAcceleratedPV.h
+
+    Copyright (C) 2015
+
+Author: Christoph Lehner (lifted with permission by Peter Boyle, brought back to Grid)
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+  template<typename M>
+    void get_real_const_bc(M& m, RealD& _b, RealD& _c) { // Returns through _b,_c the single real value shared by all entries of m.bs and of m.cs
+    ComplexD b,c;
+    b=m.bs[0]; // reference values; element 0 is read unconditionally, so m.bs/m.cs are assumed non-empty
+    c=m.cs[0];
+    std::cout << GridLogMessage << "b=" << b << ", c=" << c << std::endl;
+    for (size_t i=1;i<m.bs.size();i++) { // every entry must equal the first; m.cs is indexed over m.bs.size()
+      assert(m.bs[i] == b);
+      assert(m.cs[i] == c);
+    }
+    assert(b.imag() == 0.0); // the coefficients must be purely real
+    assert(c.imag() == 0.0);
+    _b = b.real();
+    _c = c.real();
+  }
+
+
+template<typename Vi, typename M, typename G>
+class FourierAcceleratedPV {
+ public:
+
+  ConjugateGradient<Vi> &cg;
+  M& dwfPV;
+  G& Umu;
+  GridCartesian* grid5D;
+  GridRedBlackCartesian* gridRB5D;
+  int group_in_s;
+
+  FourierAcceleratedPV(M& _dwfPV, G& _Umu, ConjugateGradient<Vi> &_cg, int _group_in_s = 2) // _group_in_s: number of (s, Ls-1-s) slice pairs packed into each solve in pvInv
+   : cg(_cg), dwfPV(_dwfPV), Umu(_Umu), group_in_s(_group_in_s) // initialisers listed in member declaration order (avoids -Wreorder)
+  {
+    assert( group_in_s > 0 && dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0); // Ls must split into whole groups; the first test keeps the modulo away from a zero divisor
+    grid5D   = SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu.Grid()); // small 5d grid with 2*group_in_s slices; NOTE(review): never released in the visible code
+    gridRB5D = SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu.Grid());
+  }
+
+  void rotatePV(const Vi& _src, Vi& dst, bool forward) const {
+    // Combines a per-slice phase exp(-/+ i*pi*s/Ls) with an FFT along dimension 0; the order and sign depend on 'forward'.
+    GridStopWatch gsw1, gsw2; // gsw1 times the phase loop, gsw2 times the FFT
+
+    typedef typename Vi::scalar_type Coeff_t;
+    int Ls = dst.Grid()->_fdimensions[0];
+
+    Vi _tmp(dst.Grid());
+    double phase = M_PI / (double)Ls;
+    Coeff_t bzero(0.0,0.0);
+
+    FFT theFFT((GridCartesian*)dst.Grid());
+
+    if (!forward) { // forward==false: phase exp(-i*phase*s) on each slice first, then FFT::forward along dimension 0
+      gsw1.Start();
+      for (int s=0;s<Ls;s++) {
+	Coeff_t a(::cos(phase*s),-::sin(phase*s));
+	axpby_ssp(_tmp,a,_src,bzero,_src,s,s); // presumably _tmp[s] = a*_src[s] + 0*_src[s] -- confirm against axpby_ssp
+      }
+      gsw1.Stop();
+
+      gsw2.Start();
+      theFFT.FFT_dim(dst,_tmp,0,FFT::forward);
+      gsw2.Stop();
+
+    } else { // forward==true: FFT::backward along dimension 0 first, then phase exp(+i*phase*s) on each slice
+
+      gsw2.Start();
+      theFFT.FFT_dim(_tmp,_src,0,FFT::backward);
+      gsw2.Stop();
+
+      gsw1.Start();
+      for (int s=0;s<Ls;s++) {
+	Coeff_t a(::cos(phase*s),::sin(phase*s));
+	axpby_ssp(dst,a,_tmp,bzero,_tmp,s,s);
+      }
+      gsw1.Stop();
+    }
+
+    std::cout << GridLogMessage << "Timing rotatePV: " << gsw1.Elapsed() << ", " << gsw2.Elapsed() << std::endl;
+
+  }
+
+  void pvInv(const Vi& _src, Vi& _dst) const {
+    // Writes _dst from _src in three steps: rotatePV(...,false), grouped twisted-mass solves per s-slice pair, rotatePV(...,true).
+    std::cout << GridLogMessage << "Fourier-Accelerated Outer Pauli Villars"<<std::endl;
+
+    typedef typename Vi::scalar_type Coeff_t;
+    int Ls = _dst.Grid()->_fdimensions[0];
+
+    GridStopWatch gswT; // total time for this call
+    gswT.Start();
+
+    RealD b,c;
+    get_real_const_bc(dwfPV,b,c); // asserts that dwfPV.bs / dwfPV.cs are constant and real
+    RealD M5 = dwfPV.M5;
+
+    // U(true) Rightinv TMinv U(false) = Minv
+
+    Vi _src_diag(_dst.Grid());
+    Vi _src_diag_slice(dwfPV.GaugeGrid());
+    Vi _dst_diag_slice(dwfPV.GaugeGrid());
+    Vi _src_diag_slices(grid5D);
+    Vi _dst_diag_slices(grid5D);
+    Vi _dst_diag(_dst.Grid());
+
+    rotatePV(_src,_src_diag,false);
+
+    // now do TM solves
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    GridStopWatch gswA, gswB; // gswA: operator/solver setup, gswB: the whole s-group loop
+
+    gswA.Start();
+
+    typedef typename M::Impl_t Impl;
+    //WilsonTMFermion<Impl> tm(x.Umu,*x.UGridF,*x.UrbGridF,0.0,0.0,solver_outer.parent.par.wparams_f);
+    std::vector<RealD> vmass(grid5D->_fdimensions[0],0.0);
+    std::vector<RealD> vmu(grid5D->_fdimensions[0],0.0);
+
+    WilsonTMFermion5D<Impl> tm(Umu,*grid5D,*gridRB5D,
+			   *(GridCartesian*)dwfPV.GaugeGrid(),
+			   *(GridRedBlackCartesian*)dwfPV.GaugeRedBlackGrid(),
+			   vmass,vmu);
+
+    //SchurRedBlackDiagTwoSolve<Vi> sol(cg);
+    SchurRedBlackDiagMooeeSolve<Vi> sol(cg); // same performance as DiagTwo
+    gswA.Stop();
+
+    gswB.Start();
+
+    for (int sgroup=0;sgroup<Ls/2/group_in_s;sgroup++) {
+      // Pass 1: per-slice mass/mu for this group, stored at positions 2*sidx (slice s) and 2*sidx+1 (partner slice)
+      for (int sidx=0;sidx<group_in_s;sidx++) {
+
+	int s = sgroup*group_in_s + sidx;
+	// the partner slice Ls-s-1 gets the same mass and the opposite-sign mu (see vmass/vmu below)
+
+	RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
+	RealD cosp = ::cos(phase);
+	RealD sinp = ::sin(phase);
+	RealD denom = b*b + c*c + 2.0*b*c*cosp;
+	RealD mass = -(b*b*M5 + c*(1.0 - cosp + c*M5) + b*(-1.0 + cosp + 2.0*c*cosp*M5))/denom;
+	RealD mu = (b+c)*sinp/denom;
+
+	vmass[2*sidx + 0] = mass;
+	vmass[2*sidx + 1] = mass;
+	vmu[2*sidx + 0] = mu;
+	vmu[2*sidx + 1] = -mu;
+
+      }
+
+      tm.update(vmass,vmu);
+      // Pass 2: gather slices s and Ls-s-1 of the rotated source into the small 5d source
+      for (int sidx=0;sidx<group_in_s;sidx++) {
+
+	int s = sgroup*group_in_s + sidx;
+	int sprime = Ls-s-1;
+
+	ExtractSlice(_src_diag_slice,_src_diag,s,0);
+	InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 0,0);
+
+	ExtractSlice(_src_diag_slice,_src_diag,sprime,0);
+	InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 1,0);
+
+      }
+
+      GridStopWatch gsw;
+      gsw.Start();
+      _dst_diag_slices = Zero(); // zero guess
+      sol(tm,_src_diag_slices,_dst_diag_slices);
+      gsw.Stop();
+      std::cout << GridLogMessage << "Solve[sgroup=" << sgroup << "] completed in " << gsw.Elapsed() << ", " << gswA.Elapsed() << std::endl;
+      // Pass 3: scatter the solutions back, applying (pA -/+ pB*G5)/(pA^2 - pB^2) to slice s / Ls-s-1
+      for (int sidx=0;sidx<group_in_s;sidx++) {
+
+	int s = sgroup*group_in_s + sidx;
+	int sprime = Ls-s-1;
+
+	RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
+	RealD cosp = ::cos(phase);
+	RealD sinp = ::sin(phase);
+
+	// now rotate with inverse of
+	Coeff_t pA = b + c*cosp;
+	Coeff_t pB = - Coeff_t(0.0,1.0)*c*sinp;
+	Coeff_t pABden = pA*pA - pB*pB;
+	// (pA + pB * G5) * (pA - pB*G5) = (pA^2 - pB^2)
+
+	ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 0,0);
+	_dst_diag_slice = (pA/pABden) * _dst_diag_slice - (pB/pABden) * (G5 * _dst_diag_slice);
+	InsertSlice(_dst_diag_slice,_dst_diag,s,0);
+
+	ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 1,0);
+	_dst_diag_slice = (pA/pABden) * _dst_diag_slice + (pB/pABden) * (G5 * _dst_diag_slice);
+	InsertSlice(_dst_diag_slice,_dst_diag,sprime,0);
+      }
+    }
+    gswB.Stop();
+
+    rotatePV(_dst_diag,_dst,true);
+
+    gswT.Stop();
+    std::cout << GridLogMessage << "PV completed in " << gswT.Elapsed() << " (Setup: " << gswA.Elapsed() << ", s-loop: " << gswB.Elapsed() << ")" << std::endl;
+  }
+
+};
+NAMESPACE_END(Grid);
+
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
similarity index 64%
rename from lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
rename to Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
index 05be76b0..4e891b55 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
@@ -43,6 +43,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3,
 template <class Impl>
 ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, 
 							 RealD _mass,
+							 RealD _c1, RealD _c2,RealD _u0,
 							 const ImplParams &p)
   : Kernels(p),
     _grid(&Fgrid),
@@ -61,6 +62,16 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
     UUUmuOdd(&Hgrid) ,
     _tmp(&Hgrid)
 {
+  int vol4;
+  int LLs=1;
+  c1=_c1;
+  c2=_c2;
+  u0=_u0;
+  vol4= _grid->oSites();
+  Stencil.BuildSurfaceList(LLs,vol4);
+  vol4= _cbgrid->oSites();
+  StencilEven.BuildSurfaceList(LLs,vol4);
+  StencilOdd.BuildSurfaceList(LLs,vol4);
 }
 
 template <class Impl>
@@ -68,22 +79,10 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
 							 GridRedBlackCartesian &Hgrid, RealD _mass,
 							 RealD _c1, RealD _c2,RealD _u0,
 							 const ImplParams &p)
-  : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
+  : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
 {
-  c1=_c1;
-  c2=_c2;
-  u0=_u0;
   ImportGauge(_Uthin,_Ufat);
 }
-template <class Impl>
-ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
-							 GridRedBlackCartesian &Hgrid, RealD _mass,
-							 const ImplParams &p)
-  : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
-{
-  ImportGaugeSimple(_Utriple,_Ufat);
-}
-
 
 ////////////////////////////////////////////////////////////
 // Momentum space propagator should be 
@@ -97,11 +96,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,Gaug
 // of above link to implmement fourier based solver.
 ////////////////////////////////////////////////////////////
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin) 
-{
-  ImportGauge(_Uthin,_Uthin);
-};
-template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
 {
   /////////////////////////////////////////////////////////////////
@@ -124,6 +118,20 @@ void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utripl
     PokeIndex<LorentzIndex>(Umu, -U, mu+4);
 
   }
+  CopyGaugeCheckerboards();
+}
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) 
+{
+  // Copy the supplied doubled fields unchanged into Umu / UUUmu, then rebuild the even/odd checkerboard copies
+  Umu   = _U;
+  UUUmu = _UUU;
+  CopyGaugeCheckerboards();
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
+{
   pickCheckerboard(Even, UmuEven,  Umu);
   pickCheckerboard(Odd,  UmuOdd ,  Umu);
   pickCheckerboard(Even, UUUmuEven,UUUmu);
@@ -159,10 +167,7 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
   }
 
-  pickCheckerboard(Even, UmuEven, Umu);
-  pickCheckerboard(Odd,  UmuOdd , Umu);
-  pickCheckerboard(Even, UUUmuEven, UUUmu);
-  pickCheckerboard(Odd,   UUUmuOdd, UUUmu);
+  CopyGaugeCheckerboards();
 }
 
 /////////////////////////////
@@ -254,7 +259,7 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
     auto B_v   = B.View();
     auto Btilde_v   = Btilde.View();
     thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++), {
-      Kernels::DhopDirK(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
+      Kernels::DhopDir(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
     });
 
     // Force in three link terms
@@ -323,7 +328,9 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
+void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) 
+{
+  DhopCalls+=2;
   conformable(in.Grid(), _grid);  // verifies full grid
   conformable(in.Grid(), out.Grid());
 
@@ -333,7 +340,9 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
+void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) 
+{
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
@@ -344,7 +353,9 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
 }
 
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
+void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) 
+{
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
@@ -369,7 +380,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
   auto in_v    =  in.View();
   auto out_v   = out.View();
   thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) , {
-      Kernels::DhopDirK(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
+      Kernels::DhopDir(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
   });
 };
 
@@ -378,16 +389,139 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
 						  DoubledGaugeField &U,
 						  DoubledGaugeField &UUU,
 						  const FermionField &in,
-						  FermionField &out, int dag) {
+						  FermionField &out, int dag) 
+{
+#ifdef GRID_OMP
+  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+  else
+#endif
+    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+}
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+								 DoubledGaugeField &U,
+								 DoubledGaugeField &UUU,
+								 const FermionField &in,
+								 FermionField &out, int dag) 
+{ // Dhop with halo communication overlapped with interior compute; surface sites are finished after CommsMerge
+#ifdef GRID_OMP
+  Compressor compressor; 
+  int len =  U.Grid()->oSites();
+
+  DhopTotalTime   -= usecond(); // balanced by the += at the end of this function
+
+  DhopFaceTime    -= usecond();
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  st.CommsMergeSHM(compressor);
+  DhopFaceTime    += usecond();
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime    -= usecond();
+#pragma omp parallel 
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    assert(nthreads > ncomms); // at least one thread must be left for compute
+
+    if (tid >= ncomms) { // compute threads: split the len sites into near-equal contiguous blocks
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = len;
+      int chunk = n / nthreads;
+      int rem   = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) { // the first rem threads take one extra site each
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+
+      // do the compute
+      auto U_v   = U.View();
+      auto UUU_v = UUU.View();
+      auto in_v  = in.View();
+      auto out_v = out.View();
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+	  // Interior = 1; Exterior = 0; must implement for staggered
+          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); 
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+	  // Interior = 1; Exterior = 0;
+          int sU = ss;
+          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
+        }
+      }
+    } else { // threads with tid < ncomms drive the communication instead
+      st.CommunicateThreaded();
+    }
+  }
+  DhopComputeTime    += usecond();
+
+  // First to enter, last to leave timing
+  DhopFaceTime    -= usecond();
+  st.CommsMerge(compressor);
+  DhopFaceTime    += usecond(); // close the interval opened two lines above
+
+  DhopComputeTime2    -= usecond();
+  { // exterior pass over the sites listed in st.surface_list (Interior = 0; Exterior = 1)
+    auto U_v   = U.View();
+    auto UUU_v = UUU.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    if (dag == DaggerYes) {
+      int sz=st.surface_list.size();
+      parallel_for (int ss = 0; ss < sz; ss++) {
+	int sU = st.surface_list[ss];
+	Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
+      }
+    } else {
+      int sz=st.surface_list.size();
+      parallel_for (int ss = 0; ss < sz; ss++) {
+	int sU = st.surface_list[ss];
+	Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
+      }
+    }
+  }
+  DhopComputeTime2    += usecond();
+  DhopTotalTime       += usecond();
+#else
+  assert(0);
+#endif
+}
+
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+							     DoubledGaugeField &U,
+							     DoubledGaugeField &UUU,
+							     const FermionField &in,
+							     FermionField &out, int dag) 
+{
   assert((dag == DaggerNo) || (dag == DaggerYes));
 
+  DhopTotalTime   -= usecond();
+
+  DhopCommTime    -= usecond();
   Compressor compressor;
   st.HaloExchange(in, compressor);
+  DhopCommTime    += usecond();
 
   auto U_v   =   U.View();
   auto UUU_v = UUU.View();
   auto in_v  =  in.View();
   auto out_v = out.View();
+  DhopComputeTime -= usecond();
   if (dag == DaggerYes) {
     thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++), {
       Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
@@ -397,8 +531,65 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
       Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
     });
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 };
 
+  ////////////////////////////////////////////////////////////////
+  // Reporting
+  ////////////////////////////////////////////////////////////////
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::Report(void) 
+{ // Prints per-call Dhop timings and mflops figures to GridLogMessage, then the reports of the three stencils
+  Coordinate latt = _grid->GlobalDimensions();
+  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; // product of the Nd global lattice extents
+  RealD NP = _grid->_Nprocessors;
+  RealD NN = _grid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; // NOTE(review): the divisions by DhopCalls below are unguarded against a zero count
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : " 
+	    << DhopCalls   << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : " 
+	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : " 
+	    << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : " 
+	    << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time: GlobalSum, then divide by NP. NOTE(review): this overwrites DhopComputeTime in place, so repeated Report() calls compound
+  _grid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+  
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report();
+}
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::ZeroCounters(void) 
+{ // Resets every Dhop performance counter of this operator and the counters of its three stencils
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopComputeTime2= 0; // exterior-pass timer accumulated by DhopInternalOverlappedComms
+  DhopFaceTime    = 0;
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
+
 //////////////////////////////////////////////////////// 
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
@@ -417,13 +608,15 @@ void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                                          PropagatorField &q_out,
                                                          Current curr_type,
                                                          unsigned int mu, 
-                                                         std::vector<Real> mom,
                                                          unsigned int tmin,
-                                                         unsigned int tmax)
+                                              unsigned int tmax,
+					      ComplexField &lattice_cmplx)
 {
   assert(0);
+
 }
 
+
 FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
 
 //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
similarity index 79%
rename from lib/qcd/action/fermion/ImprovedStaggeredFermion.h
rename to Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
index ebc36a8b..b4d8d60b 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -47,6 +47,18 @@ public:
   FermionField _tmp;
   FermionField &tmp(void) { return _tmp; }
 
+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopTotalTime;
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
+  double DhopComputeTime2;
+  double DhopFaceTime;
+
   ///////////////////////////////////////////////////////////////
   // Implement the abstract base
   ///////////////////////////////////////////////////////////////
@@ -103,25 +115,34 @@ public:
 
   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
                     const FermionField &in, FermionField &out, int dag);
+  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+                    const FermionField &in, FermionField &out, int dag);
+  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+                    const FermionField &in, FermionField &out, int dag);
 
-  // Constructor
+  //////////////////////////////////////////////////////////////////////////
+  // Grid own interface Constructor
+  //////////////////////////////////////////////////////////////////////////
   ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
 			   GridRedBlackCartesian &Hgrid, RealD _mass,
-			   RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
-			   const ImplParams &p = ImplParams());
-
-  ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
-			   GridRedBlackCartesian &Hgrid, RealD _mass,
+			   RealD _c1, RealD _c2,RealD _u0,
 			   const ImplParams &p = ImplParams());
 
+  //////////////////////////////////////////////////////////////////////////
+  // MILC constructor no gauge fields
+  //////////////////////////////////////////////////////////////////////////
   ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
+			   RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
 			   const ImplParams &p = ImplParams());
 
-
   // DoubleStore impl dependent
-  void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat);
+  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
   void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
-  void ImportGauge(const GaugeField &_Uthin);
+  void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U);
+  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+  DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  void CopyGaugeCheckerboards(void);
 
   ///////////////////////////////////////////////////////////////
   // Data members require to support the functionality
@@ -130,7 +151,8 @@ public:
   //    protected:
 public:
   // any other parameters of action ???
-
+  virtual int   isTrivialEE(void) { return 1; };
+  virtual RealD Mass(void) { return mass; }
   RealD mass;
   RealD u0;
   RealD c1;
@@ -168,9 +190,9 @@ public:
                            PropagatorField &q_out,
                            Current curr_type,
                            unsigned int mu, 
-                           std::vector<Real> mom,
                            unsigned int tmin,
-                           unsigned int tmax);
+                           unsigned int tmax,
+			   ComplexField &lattice_cmplx);
 };
 
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
similarity index 65%
rename from lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
rename to Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
index f4387cb2..9ef16f4f 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
@@ -40,8 +40,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3,
 
 // 5d lattice for DWF.
 template<class Impl>
-ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
-							     GridCartesian         &FiveDimGrid,
+ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
 							     GridRedBlackCartesian &FiveDimRedBlackGrid,
 							     GridCartesian         &FourDimGrid,
 							     GridRedBlackCartesian &FourDimRedBlackGrid,
@@ -120,16 +119,74 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
     assert(FiveDimGrid._simd_layout[0]        ==1);
 
   }
+  int LLs = FiveDimGrid._rdimensions[0];
+  int vol4= FourDimGrid.oSites();
+  Stencil.BuildSurfaceList(LLs,vol4);
 
-  // Allocate the required comms buffer
+  vol4=FourDimRedBlackGrid.oSites();
+  StencilEven.BuildSurfaceList(LLs,vol4);
+  StencilOdd.BuildSurfaceList(LLs,vol4);
+}
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
+{ // Refresh the Even/Odd checkerboard copies from the full Umu and UUUmu fields
+  pickCheckerboard(Even, UmuEven,  Umu);
+  pickCheckerboard(Odd,  UmuOdd ,  Umu);
+  pickCheckerboard(Even, UUUmuEven,UUUmu);
+  pickCheckerboard(Odd,  UUUmuOdd, UUUmu);
+}
+template<class Impl>
+ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
+							     GridCartesian         &FiveDimGrid,
+							     GridRedBlackCartesian &FiveDimRedBlackGrid,
+							     GridCartesian         &FourDimGrid,
+							     GridRedBlackCartesian &FourDimRedBlackGrid,
+							     RealD _mass,
+							     RealD _c1,RealD _c2, RealD _u0,
+							     const ImplParams &p) : // delegates all grid/parameter setup to the constructor that takes no gauge fields
+  ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
+			     FourDimGrid,FourDimRedBlackGrid,
+			     _mass,_c1,_c2,_u0,p)
+{
+  ImportGauge(_Uthin,_Ufat); // then imports the thin/fat gauge fields
+}
 
+///////////////////////////////////////////////////
+// For MILC use; pass three link U's and 1 link U
+///////////////////////////////////////////////////
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin) 
+void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) 
 {
-  ImportGauge(_Uthin,_Uthin);
-};
+  /////////////////////////////////////////////////////////////////
+  // Trivial import; phases and fattening and such like preapplied
+  /////////////////////////////////////////////////////////////////
+  for (int mu = 0; mu < Nd; mu++) {
+
+    auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
+    Impl::InsertGaugeField(UUUmu,U,mu);
+
+    U = adj( Cshift(U, mu, -3));
+    Impl::InsertGaugeField(UUUmu,-U,mu+4);
+
+    U = PeekIndex<LorentzIndex>(_Ufat, mu);
+    Impl::InsertGaugeField(Umu,U,mu);
+
+    U = adj( Cshift(U, mu, -1));
+    Impl::InsertGaugeField(Umu,-U,mu+4);
+
+  }
+  CopyGaugeCheckerboards();
+}
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) 
+{
+  /////////////////////////////////////////////////////////////////
+  // Trivial import: phases, fattening and the like are expected to be pre-applied; fields are copied unchanged
+  /////////////////////////////////////////////////////////////////
+  Umu   = _U;
+  UUUmu = _UUU;
+  CopyGaugeCheckerboards();
+}
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
 {
@@ -158,10 +215,7 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,cons
     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
   }
 
-  pickCheckerboard(Even, UmuEven, Umu);
-  pickCheckerboard(Odd,  UmuOdd , Umu);
-  pickCheckerboard(Even, UUUmuEven, UUUmu);
-  pickCheckerboard(Odd,  UUUmuOdd, UUUmu);
+  CopyGaugeCheckerboards();
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
@@ -171,15 +225,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
 
   Compressor compressor;
   Stencil.HaloExchange(in,compressor);
-  auto Umu_v = Umu.View();
+  auto Umu_v   = Umu.View();
   auto UUUmu_v = UUUmu.View();
-  auto in_v  = in.View();
-  auto out_v  = in.View();
+  auto in_v    = in.View();
+  auto out_v   = out.View();
   thread_loop( (int ss=0;ss<Umu.Grid()->oSites();ss++),{
     for(int s=0;s<Ls;s++){
       int sU=ss;
       int sF = s+Ls*sU; 
-      Kernels::DhopDirK(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sF, sU, in_v, out_v, dir, disp);
+      Kernels::DhopDir(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sF, sU, in_v, out_v, dir, disp);
     }
   });
 };
@@ -225,6 +279,176 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
   assert(0);
 }
 
+/*CHANGE: DhopInternal is now a dispatcher between the overlapped-comms and serial-comms implementations that follow */
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+						    DoubledGaugeField & U,DoubledGaugeField & UUU,
+						    const FermionField &in, FermionField &out,int dag)
+{
+#ifdef GRID_OMP
+  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) // run-time selected comms strategy
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+  else
+#endif
+    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); // the only path compiled when GRID_OMP is not defined
+}
+
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
+								   DoubledGaugeField & U,DoubledGaugeField & UUU,
+								   const FermionField &in, FermionField &out,int dag)
+{
+#ifdef GRID_OMP
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  // Overlapped path: gather halos, run comms threads alongside the (1,0) kernel pass, then a (0,1) pass over st.surface_list.
+  Compressor compressor; 
+
+  int LLs = in.Grid()->_rdimensions[0];
+  int len =  U.Grid()->oSites(); // NOTE(review): len is never read in this function
+
+  DhopFaceTime-=usecond();
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();
+
+  double ctime=0;
+  double ptime=0;
+
+  // Ugly explicit thread mapping introduced for OPA reasons:
+  // threads with tid < ncomms call st.CommunicateThreaded(); every other thread
+  // computes one contiguous block of the U.Grid()->oSites() sites. Timings are max-reduced over threads.
+#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1; // -1 is treated as a single comms thread
+    assert(nthreads > ncomms); // at least one thread must be left for compute
+    if (tid >= ncomms) {
+      double start = usecond();
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = U.Grid()->oSites(); // 4d vol
+      int chunk = n / nthreads; // base block size; the first rem compute threads take one extra site
+      int rem   = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+
+      // do the compute
+      auto   U_v  =   U.View();
+      auto UUU_v  = UUU.View();
+      auto  in_v  =  in.View();
+      auto out_v  = out.View();
+
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+	  // Interior = 1; Exterior = 0; must implement for staggered
+          Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+	  // Interior = 1; Exterior = 0;
+          int sU = ss;
+          Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
+        }
+      }
+        ptime = usecond() - start;
+    } else {
+      double start = usecond();
+      st.CommunicateThreaded();
+      ctime = usecond() - start;
+    }
+  }
+  DhopCommTime += ctime;
+  DhopComputeTime+=ptime;
+
+  // First to enter, last to leave timing
+  st.CollateThreads();
+
+  DhopFaceTime-=usecond();
+  st.CommsMerge(compressor);
+  DhopFaceTime+=usecond();
+  // Second pass, timed in DhopComputeTime2: kernels are called with the flags (0,1) on the sites listed in st.surface_list.
+  DhopComputeTime2-=usecond();
+
+  auto   U_v  =   U.View();
+  auto UUU_v  = UUU.View();
+  auto  in_v  =  in.View();
+  auto out_v  = out.View();
+  if (dag == DaggerYes) {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
+    }
+  } else {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
+    }
+  }
+  DhopComputeTime2+=usecond();
+#else
+  assert(0); // without GRID_OMP, DhopInternal never dispatches here
+#endif
+
+}
+
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+						    DoubledGaugeField & U,DoubledGaugeField & UUU,
+						    const FermionField &in, FermionField &out,int dag)
+{
+  Compressor compressor;
+  int LLs = in.Grid()->_rdimensions[0];
+  // Serial path: the full halo exchange finishes before any kernel is applied.
+  // DhopCommTime brackets st.HaloExchange, DhopComputeTime brackets the site loop,
+  // and DhopTotalTime brackets both.
+ //double t1=usecond();
+  DhopTotalTime -= usecond();
+  DhopCommTime -= usecond();
+  st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
+  
+  DhopComputeTime -= usecond();
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  auto   U_v  =   U.View();
+  auto UUU_v  = UUU.View();
+  auto  in_v  =  in.View();
+  auto out_v  = out.View();
+  if (dag == DaggerYes) {
+    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
+      int sU=ss;
+      Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v); // no interior/exterior flags passed here
+    }
+  } else {
+    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
+      int sU=ss;
+      Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v); // no interior/exterior flags passed here
+    }
+  }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
+ //double t2=usecond();
+ //std::cout << __FILE__ << " " << __func__  << " Total Time " << DhopTotalTime << std::endl;
+ //std::cout << __FILE__ << " " << __func__  << " Total Time Org " << t2-t1 << std::endl;
+ //std::cout << __FILE__ << " " << __func__  << " Comml Time " << DhopCommTime << std::endl;
+ //std::cout << __FILE__ << " " << __func__  << " Compute Time " << DhopComputeTime << std::endl;
+
+}
+/*CHANGE END*/
+
+/* ORG
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 						    DoubledGaugeField & U,DoubledGaugeField & UUU,
@@ -258,6 +482,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
   DhopComputeTime += usecond();
   DhopTotalTime   += usecond();
 }
+*/
 
 
 template<class Impl>
@@ -340,6 +565,9 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
   DhopTotalTime    = 0;
   DhopCommTime    = 0;
   DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+  DhopComputeTime2 = 0; // accumulated by DhopInternalOverlappedComms; reset together with the other timers
+
   Stencil.ZeroCounters();
   StencilEven.ZeroCounters();
   StencilOdd.ZeroCounters();
@@ -427,11 +655,12 @@ void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in
 							   PropagatorField &q_out,
 							   Current curr_type,
 							   unsigned int mu, 
-							   std::vector<Real> mom,
 							   unsigned int tmin,
-							   unsigned int tmax)
+                                              unsigned int tmax,
+					      ComplexField &lattice_cmplx)
 {
   assert(0);
+
 }
 
 FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
similarity index 74%
rename from lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
rename to Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
index 311d675b..6b1e0993 100644
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -63,6 +63,8 @@ public:
   double DhopCalls;
   double DhopCommTime;
   double DhopComputeTime;
+      double DhopComputeTime2;
+      double DhopFaceTime;
 
   ///////////////////////////////////////////////////////////////
   // Implement the abstract base
@@ -118,7 +120,27 @@ public:
 		    FermionField &out,
 		    int dag);
     
+    void DhopInternalOverlappedComms(StencilImpl & st,
+		      LebesgueOrder &lo,
+		      DoubledGaugeField &U,
+		      DoubledGaugeField &UUU,
+		      const FermionField &in, 
+		      FermionField &out,
+		      int dag);
+
+    void DhopInternalSerialComms(StencilImpl & st,
+		      LebesgueOrder &lo,
+		      DoubledGaugeField &U,
+		      DoubledGaugeField &UUU,
+		      const FermionField &in, 
+		      FermionField &out,
+		      int dag);
+    
+    
   // Constructors
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    // Grid internal interface -- Thin link and fat link, with coefficients
+    ////////////////////////////////////////////////////////////////////////////////////////////////
   ImprovedStaggeredFermion5D(GaugeField &_Uthin,
 			     GaugeField &_Ufat,
 			     GridCartesian         &FiveDimGrid,
@@ -126,18 +148,38 @@ public:
 			     GridCartesian         &FourDimGrid,
 			     GridRedBlackCartesian &FourDimRedBlackGrid,
 			     double _mass,
-			     RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
+			       RealD _c1, RealD _c2,RealD _u0,
+			       const ImplParams &p= ImplParams());
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
+			       GridRedBlackCartesian &FiveDimRedBlackGrid,
+			       GridCartesian         &FourDimGrid,
+			       GridRedBlackCartesian &FourDimRedBlackGrid,
+			       double _mass,
+			       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
 			     const ImplParams &p= ImplParams());
     
-  // DoubleStore
-  void ImportGauge(const GaugeField &_U);
+    // DoubleStore gauge field in operator
+    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
   void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
+    void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
+    void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+    // Give a reference; can be used to do an assignment or copy back out after import
+    // if Carleton wants to cache them and not use the ImportSimple
+    DoubledGaugeField &GetU(void)   { return Umu ; } ;
+    DoubledGaugeField &GetUUU(void) { return UUUmu; };
+    void CopyGaugeCheckerboards(void);
     
   ///////////////////////////////////////////////////////////////
   // Data members require to support the functionality
   ///////////////////////////////////////////////////////////////
 public:
     
+    virtual int   isTrivialEE(void) { return 1; };
+    virtual RealD Mass(void) { return mass; }
+    
   GridBase *_FourDimGrid;
   GridBase *_FourDimRedBlackGrid;
   GridBase *_FiveDimGrid;
@@ -181,9 +223,9 @@ public:
 			   PropagatorField &q_out,
 			   Current curr_type,
 			   unsigned int mu, 
-			   std::vector<Real> mom,
 			   unsigned int tmin,
-			   unsigned int tmax);
+                             unsigned int tmax,
+                 	     ComplexField &lattice_cmplx);
 };
 
 NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/MADWF.h b/Grid/qcd/action/fermion/MADWF.h
new file mode 100644
index 00000000..f7f0ee1b
--- /dev/null
+++ b/Grid/qcd/action/fermion/MADWF.h
@@ -0,0 +1,192 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/qcd/action/fermion/MADWF.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template <class Fieldi, class Fieldo,IfNotSame<Fieldi,Fieldo> X=0> // presumably enabled only when Fieldi and Fieldo differ
+inline void convert(const Fieldi &from,Fieldo &to) 
+{
+  precisionChange(to,from); // note the argument order: destination first
+}
+template <class Fieldi, class Fieldo,IfSame<Fieldi,Fieldo> X=0> // presumably enabled only when Fieldi and Fieldo are the same type
+inline void convert(const Fieldi &from,Fieldo &to) 
+{
+  to=from; // same type: plain assignment
+}
+
+template<class Matrixo,class Matrixi,class PVinverter,class SchurSolver, class Guesser> 
+class MADWF 
+{
+ private:
+  typedef typename Matrixo::FermionField FermionFieldo;
+  typedef typename Matrixi::FermionField FermionFieldi;
+
+  PVinverter  & PauliVillarsSolvero;// For the outer field
+  SchurSolver & SchurSolveri;       // For the inner approx field
+  Guesser     & Guesseri;           // To deflate the inner approx solves
+
+  Matrixo & Mato;                   // Action object for outer
+  Matrixi & Mati;                   // Action object for inner
+
+  RealD target_resid;
+  int   maxiter;
+ public:
+
+  // Binds operators and solvers by reference; resid is the relative-residual target tested in operator(), _maxiter caps its loop.
+  MADWF(Matrixo &_Mato,
+	Matrixi &_Mati, 
+	PVinverter &_PauliVillarsSolvero, 
+	SchurSolver &_SchurSolveri,
+	Guesser & _Guesseri,
+	RealD resid,
+	int _maxiter) :
+    // Initialisers are written in member declaration order, which is the order
+    // C++ actually uses for initialisation (avoids -Wreorder warnings).
+    PauliVillarsSolvero(_PauliVillarsSolvero),SchurSolveri(_SchurSolveri),Guesseri(_Guesseri),
+    Mato(_Mato),Mati(_Mati),
+    target_resid(resid),maxiter(_maxiter)
+  {
+  }
+
+  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
+  {
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+    std::cout << GridLogMessage<< "  MADWF-like algorithm                           " << std::endl;
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+    // Defect-correction loop: each pass builds a correction B from the current defect, adds it to sol5, and re-forms the defect.
+    FermionFieldi    c0i(Mati.GaugeGrid()); // 4d 
+    FermionFieldi    y0i(Mati.GaugeGrid()); // 4d
+    FermionFieldo    c0 (Mato.GaugeGrid()); // 4d 
+    FermionFieldo    y0 (Mato.GaugeGrid()); // 4d
+
+    FermionFieldo    A(Mato.FermionGrid()); // Temporary outer
+    FermionFieldo    B(Mato.FermionGrid()); // Temporary outer
+    FermionFieldo    b(Mato.FermionGrid()); // 5d source
+
+    FermionFieldo    c(Mato.FermionGrid()); // PVinv source; reused so store
+    FermionFieldo    defect(Mato.FermionGrid()); // 5d defect b - M sol5 (starts as b)
+
+    FermionFieldi   ci(Mati.FermionGrid()); 
+    FermionFieldi   yi(Mati.FermionGrid()); 
+    FermionFieldi   xi(Mati.FermionGrid()); 
+    FermionFieldi srci(Mati.FermionGrid()); 
+    FermionFieldi   Ai(Mati.FermionGrid()); 
+
+    RealD m=Mati.Mass(); // saved so the inner mass can be restored after SetMass(1.0) in the loop
+
+    ///////////////////////////////////////
+    //Import source, include Dminus factors
+    ///////////////////////////////////////
+    Mato.ImportPhysicalFermionSource(src4,b); 
+    std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
+    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;
+
+    defect = b;
+    sol5=Zero();
+    for (int i=0;i<maxiter;i++) {
+
+      ///////////////////////////////////////
+      // Set up c0 from current defect
+      ///////////////////////////////////////
+      PauliVillarsSolvero(Mato,defect,A);
+      Mato.Pdag(A,c);
+      ExtractSlice(c0, c, 0 , 0);
+
+      ////////////////////////////////////////////////
+      // Solve the inner system with surface term c0
+      ////////////////////////////////////////////////
+      ci = Zero();  
+      convert(c0,c0i); // Possible precision change
+      InsertSlice(c0i,ci,0, 0);
+
+      // Dwm P y = Dwm x = D(1) P (c0,0,0,0)^T
+      Mati.P(ci,Ai);
+      Mati.SetMass(1.0);      Mati.M(Ai,srci);      Mati.SetMass(m);
+      SchurSolveri(Mati,srci,xi,Guesseri); 
+      Mati.Pdag(xi,yi);
+      ExtractSlice(y0i, yi, 0 , 0);
+      convert(y0i,y0); // Possible precision change
+
+      //////////////////////////////////////
+      // Propagate solution back to outer system
+      // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
+      //////////////////////////////////////
+      c0 = - y0;
+      InsertSlice(c0, c, 0   , 0);
+
+      /////////////////////////////
+      // Reconstruct the bulk solution Pdag PV^-1 Dm P 
+      /////////////////////////////
+      Mato.P(c,B);
+      Mato.M(B,A);
+      PauliVillarsSolvero(Mato,A,B);
+      Mato.Pdag(B,A);
+
+      //////////////////////////////
+      // Reinsert surface prop
+      //////////////////////////////
+      InsertSlice(y0,A,0,0);
+
+      //////////////////////////////
+      // Convert from y back to x 
+      //////////////////////////////
+      Mato.P(A,B);
+
+      //         sol5' = sol5 + M^-1 defect
+      //               = sol5 + M^-1 src - M^-1 M sol5  ...
+      sol5 = sol5 + B;
+      std::cout << GridLogMessage << "***************************************" <<std::endl;
+      std::cout << GridLogMessage << " Sol5 update "<<std::endl;
+      std::cout << GridLogMessage << "***************************************" <<std::endl;
+      std::cout << GridLogMessage << " Sol5 now "<<norm2(sol5)<<std::endl;
+      std::cout << GridLogMessage << " delta    "<<norm2(B)<<std::endl;
+
+       // New defect  = b - M sol5
+       Mato.M(sol5,A);
+       defect = b - A;
+
+       std::cout << GridLogMessage << " defect   "<<norm2(defect)<<std::endl;
+
+       double resid = ::sqrt(norm2(defect) / norm2(b)); // relative residual; NOTE(review): divides by zero if norm2(b) is 0
+       std::cout << GridLogMessage << "Residual " << i << ": " << resid  << std::endl;
+       std::cout << GridLogMessage << "***************************************" <<std::endl;
+
+       if (resid < target_resid) {
+	 return;
+       }
+    }
+    // Reached only when the residual never dropped below target_resid within maxiter passes: report and abort.
+    std::cout << GridLogMessage << "MADWF : Exceeded maxiter "<<std::endl;
+    assert(0);
+
+  }
+
+};
+
+NAMESPACE_END(Grid);
diff --git a/lib/qcd/action/fermion/MobiusEOFAFermion.cc b/Grid/qcd/action/fermion/MobiusEOFAFermion.cc
similarity index 100%
rename from lib/qcd/action/fermion/MobiusEOFAFermion.cc
rename to Grid/qcd/action/fermion/MobiusEOFAFermion.cc
diff --git a/lib/qcd/action/fermion/MobiusEOFAFermion.h b/Grid/qcd/action/fermion/MobiusEOFAFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/MobiusEOFAFermion.h
rename to Grid/qcd/action/fermion/MobiusEOFAFermion.h
diff --git a/lib/qcd/action/fermion/MobiusEOFAFermioncache.cc b/Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc
similarity index 100%
rename from lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
rename to Grid/qcd/action/fermion/MobiusEOFAFermioncache.cc
diff --git a/lib/qcd/action/fermion/MobiusEOFAFermiondense.cc b/Grid/qcd/action/fermion/MobiusEOFAFermiondense.cc
similarity index 100%
rename from lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
rename to Grid/qcd/action/fermion/MobiusEOFAFermiondense.cc
diff --git a/lib/qcd/action/fermion/MobiusEOFAFermionssp.cc b/Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc
similarity index 100%
rename from lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
rename to Grid/qcd/action/fermion/MobiusEOFAFermionssp.cc
diff --git a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc b/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
similarity index 99%
rename from lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
rename to Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
index 97001033..ff8c5816 100644
--- a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
+++ b/Grid/qcd/action/fermion/MobiusEOFAFermionvec.cc
@@ -868,7 +868,7 @@ void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, Fermio
 
 	  a0 = a0 + incr;
 	  a1 = a1 + incr;
-	  a2 = a2 + sizeof(Simd::scalar_type);
+              a2 = a2 + sizeof(typename Simd::scalar_type);
 	}
       }
 
diff --git a/lib/qcd/action/fermion/MobiusFermion.h b/Grid/qcd/action/fermion/MobiusFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/MobiusFermion.h
rename to Grid/qcd/action/fermion/MobiusFermion.h
diff --git a/lib/qcd/action/fermion/MobiusZolotarevFermion.h b/Grid/qcd/action/fermion/MobiusZolotarevFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/MobiusZolotarevFermion.h
rename to Grid/qcd/action/fermion/MobiusZolotarevFermion.h
diff --git a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
similarity index 92%
rename from lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
rename to Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
index 04079c2c..350e89e2 100644
--- a/lib/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
+++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h
@@ -40,8 +40,8 @@ public:
   INHERIT_IMPL_TYPES(Impl);
 public:
 
-  void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m) { 
-    this->MomentumSpacePropagatorHw(out,in,_m);
+     void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
+       this->MomentumSpacePropagatorHw(out,in,_m,twist);
   };
 
   // Constructors
diff --git a/lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
rename to Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h
diff --git a/lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
rename to Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h
diff --git a/lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
rename to Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h
diff --git a/lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
rename to Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h
diff --git a/lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
rename to Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h
diff --git a/lib/qcd/action/fermion/PartialFractionFermion5D.cc b/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
similarity index 94%
rename from lib/qcd/action/fermion/PartialFractionFermion5D.cc
rename to Grid/qcd/action/fermion/PartialFractionFermion5D.cc
index f0a1a2f8..12b9c4ec 100644
--- a/lib/qcd/action/fermion/PartialFractionFermion5D.cc
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.cc
@@ -390,6 +390,27 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
   amax=zolo_hi;
 }
 
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
+    { // Copies slice Ls-1 of the 5d solution (on FermionGrid) into the 4d field (on GaugeGrid).
+      int Ls = this->Ls;
+      conformable(solution5d.Grid(),this->FermionGrid());
+      conformable(exported4d.Grid(),this->GaugeGrid());
+      ExtractSlice(exported4d, solution5d, Ls-1, 0); // args are (slice index, orthogonal dimension); the dimension is 0, not Ls-1
+    }
+    template<class Impl>
+    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
+    { // Places the 4d source on slice Ls-1 of a zeroed 5d field, multiplies by Gamma5, then applies Dminus.
+      int Ls = this->Ls;
+      conformable(imported5d.Grid(),this->FermionGrid());
+      conformable(input4d.Grid()   ,this->GaugeGrid());
+      FermionField tmp(this->FermionGrid());
+      tmp=Zero();
+      InsertSlice(input4d, tmp, Ls-1, 0); // args are (slice index, orthogonal dimension); the dimension is 0, not Ls-1
+      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
+      this->Dminus(tmp,imported5d);
+    }
+
 // Constructors
 template<class Impl>
 PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
diff --git a/lib/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
similarity index 90%
rename from lib/qcd/action/fermion/PartialFractionFermion5D.h
rename to Grid/qcd/action/fermion/PartialFractionFermion5D.h
index 80365594..7a3de997 100644
--- a/lib/qcd/action/fermion/PartialFractionFermion5D.h
+++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h
@@ -68,6 +68,12 @@ public:
   // Efficient support for multigrid coarsening
   virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
 
+      ///////////////////////////////////////////////////////////////
+      // Physical surface field utilities
+      ///////////////////////////////////////////////////////////////
+      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
+      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
+
   // Constructors
   PartialFractionFermion5D(GaugeField &_Umu,
 			   GridCartesian         &FiveDimGrid,
diff --git a/Grid/qcd/action/fermion/PauliVillarsInverters.h b/Grid/qcd/action/fermion/PauliVillarsInverters.h
new file mode 100644
index 00000000..7d003087
--- /dev/null
+++ b/Grid/qcd/action/fermion/PauliVillarsInverters.h
@@ -0,0 +1,92 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/qcd/action/fermion/PauliVillarsInverters.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Field>
+class PauliVillarsSolverUnprec
+{
+ public:
+  ConjugateGradient<Field> & CG;
+  PauliVillarsSolverUnprec(  ConjugateGradient<Field> &_CG) : CG(_CG){};
+  // Solves with the operator mass temporarily set to 1.0; the caller's mass is restored before returning.
+  template<class Matrix>
+  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    RealD m = _Matrix.Mass();
+    Field A  (_Matrix.FermionGrid());
+    // HermOp wraps _Matrix for CG; the right-hand side handed to CG is A = Mdag src.
+    MdagMLinearOperator<Matrix,Field> HermOp(_Matrix);
+
+    _Matrix.SetMass(1.0);
+    _Matrix.Mdag(src,A);
+    CG(HermOp,A,sol);
+    _Matrix.SetMass(m);
+  };
+};
+
+template<class Field,class SchurSolverType>
+class PauliVillarsSolverRBprec
+{
+ public:
+  SchurSolverType & SchurSolver;
+  PauliVillarsSolverRBprec( SchurSolverType &_SchurSolver) : SchurSolver(_SchurSolver){};
+  // Runs the wrapped Schur solver with the operator mass temporarily set to 1.0, then restores the caller's mass.
+  template<class Matrix>
+  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    RealD m = _Matrix.Mass();
+    // (The 5d temporary 'Field A' formerly constructed here was never read and has been removed.)
+
+    _Matrix.SetMass(1.0);
+    SchurSolver(_Matrix,src,sol);
+    _Matrix.SetMass(m);
+  };
+};
+
+template<class Field,class GaugeField>
+class PauliVillarsSolverFourierAccel
+{
+ public:
+  GaugeField      & Umu;
+  ConjugateGradient<Field> & CG;
+  // Holds references only; a FourierAcceleratedPV object is built afresh on every call to operator().
+  PauliVillarsSolverFourierAccel(GaugeField &_Umu,ConjugateGradient<Field> &_CG) :  Umu(_Umu), CG(_CG)
+  {
+  };
+
+  template<class Matrix>
+  void operator() (Matrix &_Matrix,const Field &src,Field &sol)
+  {
+    FourierAcceleratedPV<Field, Matrix, typename Matrix::GaugeField > faPV(_Matrix,Umu,CG) ; // NOTE(review): uses Matrix::GaugeField, not the class's GaugeField parameter
+    faPV.pvInv(src,sol);
+  };
+};
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/Reconstruct5Dprop.h b/Grid/qcd/action/fermion/Reconstruct5Dprop.h
new file mode 100644
index 00000000..93af1ab8
--- /dev/null
+++ b/Grid/qcd/action/fermion/Reconstruct5Dprop.h
@@ -0,0 +1,134 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/qcd/action/fermion/Reconstruct5Dprop.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
+ private:
+  PVinverter & PauliVillarsSolver;
+ public:
+
+ /////////////////////////////////////////////////////
+ // First cut works, 10 Oct 2018.
+ //
+ // Must form a plan to get this into production for Zmobius acceleration
+ // of the Mobius exact AMA corrections.
+ //
+ // TODO : understand absence of contact term in eqns in Hantao's thesis
+ //        sol4 is contact term subtracted, but thesis & Brower's paper suggests not.
+ //
+ // Step 1: Localise PV inverse in a routine. [DONE]
+ // Step 2: Schur based PV inverse            [DONE]
+ // Step 3: Fourier accelerated PV inverse    [DONE]
+ //
+ /////////////////////////////////////////////////////
+ 
+  Reconstruct5DfromPhysical(PVinverter &_PauliVillarsSolver) 
+    : PauliVillarsSolver(_PauliVillarsSolver) 
+  { 
+  };
+
+
+   template<class Matrix>
+   void PV(Matrix &_Matrix,const Field &src,Field &sol) // sol = M(src) evaluated with the mass set to 1.0
+   {
+     RealD m = _Matrix.Mass(); // remember the caller's mass
+     _Matrix.SetMass(1.0);
+     _Matrix.M(src,sol);
+     _Matrix.SetMass(m); // restore it
+   }
+   template<class Matrix>
+   void PVdag(Matrix &_Matrix,const Field &src,Field &sol) // sol = Mdag(src) evaluated with the mass set to 1.0
+   {
+     RealD m = _Matrix.Mass(); // remember the caller's mass
+     _Matrix.SetMass(1.0);
+     _Matrix.Mdag(src,sol);
+     _Matrix.SetMass(m); // restore it
+   }
+  template<class Matrix>
+  void operator() (Matrix & _Matrix,const Field &sol4,const Field &src4, Field &sol5){
+    // Builds the 5d field sol5 from the 4d source src4 and the 4d solution sol4 using two PauliVillarsSolver applications.
+    int Ls =  _Matrix.Ls;
+
+    Field psi4(_Matrix.GaugeGrid());
+    Field psi(_Matrix.FermionGrid());
+    Field A  (_Matrix.FermionGrid());
+    Field B  (_Matrix.FermionGrid());
+    Field c  (_Matrix.FermionGrid());
+
+    typedef typename Matrix::Coeff_t Coeff_t; // NOTE(review): Coeff_t is not used in this function
+
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+    std::cout << GridLogMessage<< " Reconstruct5Dprop: c.f. MADWF algorithm         " << std::endl;
+    std::cout << GridLogMessage<< " ************************************************" << std::endl;
+
+    ///////////////////////////////////////
+    //Import source, include Dminus factors
+    ///////////////////////////////////////
+    _Matrix.ImportPhysicalFermionSource(src4,B); 
+
+    ///////////////////////////////////////
+    // Set up c from src4
+    ///////////////////////////////////////
+    PauliVillarsSolver(_Matrix,B,A);
+    _Matrix.Pdag(A,c);
+
+    //////////////////////////////////////
+    // Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
+    //////////////////////////////////////
+    psi4 = - sol4;
+    InsertSlice(psi4, psi, 0   , 0);
+    for (int s=1;s<Ls;s++) { // slices 1..Ls-1 of psi are copied from c, so every slice of psi is written
+      ExtractSlice(psi4,c,s,0);
+       InsertSlice(psi4,psi,s,0);
+    }
+
+    /////////////////////////////
+    // Pdag PV^-1 Dm P 
+    /////////////////////////////
+    _Matrix.P(psi,B);
+    _Matrix.M(B,A);
+    PauliVillarsSolver(_Matrix,A,B);
+    _Matrix.Pdag(B,A);
+
+    //////////////////////////////
+    // Reinsert surface prop
+    //////////////////////////////
+    InsertSlice(sol4,A,0,0);
+
+    //////////////////////////////
+    // Convert from y back to x 
+    //////////////////////////////
+    _Matrix.P(A,sol5);
+    
+  }
+};
+
+NAMESPACE_END(Grid);
+
diff --git a/lib/qcd/action/fermion/ScaledShamirFermion.h b/Grid/qcd/action/fermion/ScaledShamirFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/ScaledShamirFermion.h
rename to Grid/qcd/action/fermion/ScaledShamirFermion.h
diff --git a/lib/qcd/action/fermion/SchurDiagTwoKappa.h b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h
similarity index 100%
rename from lib/qcd/action/fermion/SchurDiagTwoKappa.h
rename to Grid/qcd/action/fermion/SchurDiagTwoKappa.h
diff --git a/lib/qcd/action/fermion/ShamirZolotarevFermion.h b/Grid/qcd/action/fermion/ShamirZolotarevFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/ShamirZolotarevFermion.h
rename to Grid/qcd/action/fermion/ShamirZolotarevFermion.h
diff --git a/Grid/qcd/action/fermion/StaggeredKernels.cc b/Grid/qcd/action/fermion/StaggeredKernels.cc
new file mode 100644
index 00000000..fa2cf155
--- /dev/null
+++ b/Grid/qcd/action/fermion/StaggeredKernels.cc
@@ -0,0 +1,294 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/StaggeredKernels.cc
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric; // default kernel family; DhopSite below switches on this value
+int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute; // default comms mode; not read anywhere in this file
+
+#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
+  SE = st.GetEntry(ptype, Dir+skew, sF);			\
+  if (SE->_is_local ) {						\
+    if (SE->_permute) {						\
+      chi_p = &chi;						\
+      permute(chi,  in[SE->_offset], ptype);		\
+    } else {							\
+      chi_p = &in[SE->_offset];				\
+    }								\
+  } else {							\
+    chi_p = &buf[SE->_offset];					\
+  }								\
+  multLink(Uchi, U[sU], *chi_p, Dir);			
+// GENERIC_STENCIL_LEG (above) applies every leg, reading from in (permuted if flagged) or buf; _INT (below) applies a leg only if it is local or st.same_node[Dir] is set.
+#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
+  SE = st.GetEntry(ptype, Dir+skew, sF);			\
+  if (SE->_is_local ) {						\
+    if (SE->_permute) {						\
+      chi_p = &chi;						\
+      permute(chi,  in[SE->_offset], ptype);		\
+    } else {							\
+      chi_p = &in[SE->_offset];				\
+    }								\
+  } else if ( st.same_node[Dir] ) {				\
+    chi_p = &buf[SE->_offset];					\
+  }								\
+  if (SE->_is_local || st.same_node[Dir] ) {			\
+    multLink(Uchi, U[sU], *chi_p, Dir);			\
+  }
+// _EXT (below) applies the complementary legs (not local and not same_node): reads them from buf and counts each one in the caller's nmu.
+#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
+  SE = st.GetEntry(ptype, Dir+skew, sF);			\
+  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+    nmu++;							\
+    chi_p = &buf[SE->_offset];					\
+    multLink(Uchi, U[sU], *chi_p, Dir);			\
+  }
+
+// Constructor: hands the implementation parameters p straight to the Base sub-object; no other state is set up here.
+template <class Impl> StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
+
+////////////////////////////////////////////////////////////////////////////////////
+// Generic implementation; move to different file?  Int, Ext, Int+Ext cases for comms overlap
+// DhopSiteGeneric = Int+Ext: sums all 16 legs (8 with U, 8 with UUU at skew 8), negates if dag, streams to out[sF]
+////////////////////////////////////////////////////////////////////////////////////
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+					     SiteSpinor *buf, int LLs, int sU, 
+					     const FermionFieldView &in, FermionFieldView &out, int dag) {
+  const SiteSpinor *chi_p;
+  SiteSpinor chi;
+  SiteSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+  int skew;
+  // Uchi is not zeroed here: the first leg uses Impl::multLink (presumably an overwrite), the rest use multLinkAdd.
+  for(int s=0;s<LLs;s++){
+    int sF=LLs*sU+s;
+    skew = 0;
+    GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
+    GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
+    skew=8;
+    GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
+    if ( dag ) { 
+      Uchi = - Uchi;
+    } 
+    vstream(out[sF], Uchi);
+  }
+};
+
+  ///////////////////////////////////////////////////
+  // Only contributions from interior of our node: legs whose entry is local or whose st.same_node[Dir] is set.
+  // Uchi starts from Zero(), is negated when dag is set, and is streamed to out[sF] for every s in [0,LLs).
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
+						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+						SiteSpinor *buf, int LLs, int sU, 
+						const FermionFieldView &in, FermionFieldView &out,int dag) {
+  const SiteSpinor *chi_p;
+  SiteSpinor chi;
+  SiteSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+  int skew ;
+  // The LEG_INT macro uses the locals above by name (chi_p, chi, Uchi, SE, ptype) plus st, in, buf, sU and sF.
+  for(int s=0;s<LLs;s++){
+    int sF=LLs*sU+s;
+    skew = 0;
+    Uchi=Zero();
+    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
+    skew=8;
+    GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
+    if ( dag ) {
+      Uchi = - Uchi;
+    }
+    vstream(out[sF], Uchi);
+  }
+};
+
+
+  ///////////////////////////////////////////////////
+  // Only contributions from exterior of our node: legs that are neither local nor same_node (read from buf).
+  // Their sum is added to (dag: subtracted from) out[sF]; an s with no exterior leg leaves out[sF] untouched.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
+						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+						SiteSpinor *buf, int LLs, int sU,
+						const FermionFieldView &in, FermionFieldView &out,int dag) {
+  const SiteSpinor *chi_p;
+  SiteSpinor chi;
+  SiteSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+  int skew ;
+
+  for(int s=0;s<LLs;s++){
+    int sF=LLs*sU+s;
+    int nmu=0; // exterior legs found for this s only; reset per s, as DhopSiteHandExt does
+    skew = 0;
+    Uchi=Zero();
+    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
+    skew=8;
+    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
+    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
+
+    if ( nmu ) { 
+      if ( dag ) { 
+	out[sF] = out[sF] - Uchi;
+      } else { 
+	out[sF] = out[sF] + Uchi;
+      }
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////
+// Driving / wrapping routine to select right kernel
+////////////////////////////////////////////////////////////////////////////////////
+
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
+					 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+					 SiteSpinor *buf, int LLs, int sU,
+					 const FermionFieldView &in, FermionFieldView &out, int interior,int exterior)
+{
+  // Daggered hop: forward everything to the general DhopSite overload with dag fixed to 1.
+  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,1,interior,exterior);
+};
+
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
+				      DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+				      SiteSpinor *buf, int LLs, int sU,
+				      const FermionFieldView &in, FermionFieldView &out, int interior,int exterior)
+{
+  // Non-daggered hop: forward everything to the general DhopSite overload with dag fixed to 0.
+  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,0,interior,exterior);
+};
+
+// Fan out to the kernel family chosen by Opt: both flags -> full kernel, interior only -> Int, exterior only -> Ext.
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+				      SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,
+				      int dag,int interior,int exterior) 
+{
+  const bool whole = interior && exterior; // both halves requested: run the single-pass kernel
+  switch(Opt) {
+#ifdef AVX512
+  case OptInlineAsm:
+    if ( whole ) {
+      DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
+    } else { 
+      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
+      assert(0);
+    }
+    break;
+#endif
+  case OptHandUnroll:
+    if ( whole ) {
+      DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
+    } else if ( interior ) {
+      DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
+    } else if ( exterior ) {
+      DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
+    }
+    break;
+  case OptGeneric:
+    if ( whole ) {
+      DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
+    } else if ( interior ) {
+      DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
+    } else if ( exterior ) {
+      DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
+    }
+    break;
+  default:
+    std::cout<<"Oops Opt = "<<Opt<<std::endl;
+    assert(0);
+  }
+};
+
+template <class Impl>
+void StaggeredKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeFieldView &U,  DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
+				      int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp) 
+{
+  // Unimplemented stub: the only statement is the assert(0) below; none of the arguments are used.
+  // Disp should be either +1,-1,+3,-3
+  // What about "dag" ?
+  // Because we work out pU . dS/dU U
+  assert(0);
+}
+
+FermOpStaggeredTemplateInstantiate(StaggeredKernels);
+FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
+
+NAMESPACE_END(Grid);
+
+
diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h
new file mode 100644
index 00000000..e833a093
--- /dev/null
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -0,0 +1,117 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/StaggeredKernels.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi, Peter Boyle
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid)
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Helper routines that implement Staggered stencil for a single site.
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class StaggeredKernelsStatic { 
+ public:
+  enum { OptGeneric, OptHandUnroll, OptInlineAsm }; // kernel implementations selectable through Opt
+  enum { CommsAndCompute, CommsThenCompute };       // values intended for Comms
+  static int Opt;   // defined in StaggeredKernels.cc, initialised to OptGeneric
+  static int Comms; // defined in StaggeredKernels.cc, initialised to CommsAndCompute
+};
+ 
+template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic { 
+ public:
+  // Single-site staggered hopping kernels; Opt (inherited from StaggeredKernelsStatic) picks the implementation in DhopSite.
+  INHERIT_IMPL_TYPES(Impl);
+  typedef FermionOperator<Impl> Base;
+   
+public:
+   // Hop for a single (dir,disp); the definition in StaggeredKernels.cc is currently an assert(0) stub.
+   void DhopDir(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
+		int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
+
+   ///////////////////////////////////////////////////////////////////////////////////////
+   // Generic Nc kernels
+   ///////////////////////////////////////////////////////////////////////////////////////
+   void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, 
+			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
+			SiteSpinor * buf, int LLs, int sU, 
+			const FermionFieldView &in, FermionFieldView &out,int dag);
+   void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, 
+			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
+			   SiteSpinor * buf, int LLs, int sU, 
+			   const FermionFieldView &in, FermionFieldView &out,int dag);
+   void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, 
+			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+			   SiteSpinor * buf, int LLs, int sU, 
+			   const FermionFieldView &in, FermionFieldView &out,int dag);
+
+   ///////////////////////////////////////////////////////////////////////////////////////
+   // Nc=3 specific kernels
+   ///////////////////////////////////////////////////////////////////////////////////////
+   void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
+		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
+		     SiteSpinor * buf, int LLs, int sU, 
+		     const FermionFieldView &in, FermionFieldView &out,int dag);
+   void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
+			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
+			SiteSpinor * buf, int LLs, int sU, 
+			const FermionFieldView &in, FermionFieldView &out,int dag);
+   void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
+			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
+			SiteSpinor * buf, int LLs, int sU, 
+			const FermionFieldView &in, FermionFieldView &out,int dag);
+
+   ///////////////////////////////////////////////////////////////////////////////////////
+   // Asm Nc=3 specific kernels
+   ///////////////////////////////////////////////////////////////////////////////////////
+   void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
+		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
+		    SiteSpinor * buf, int LLs, int sU, 
+		    const FermionFieldView &in, FermionFieldView &out,int dag);
+   ///////////////////////////////////////////////////////////////////////////////////////////////////
+   // Generic interface; fan out to right routine
+   ///////////////////////////////////////////////////////////////////////////////////////////////////
+   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
+		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
+		 SiteSpinor * buf, int LLs, int sU,
+		 const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
+
+   void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, 
+		    DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
+		    SiteSpinor * buf, int LLs, int sU,
+		    const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
+   // Overload taking dag explicitly; the two wrappers above forward here with dag=0 (DhopSite) and dag=1 (DhopSiteDag).
+   void DhopSite(StencilImpl &st, LebesgueOrder &lo, 
+		 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
+		 SiteSpinor * buf, int LLs, int sU,
+		 const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
+  
+public:
+
+  StaggeredKernels(const ImplParams &p = ImplParams());
+
+};
+NAMESPACE_END(Grid);    
diff --git a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc b/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
similarity index 93%
rename from lib/qcd/action/fermion/StaggeredKernelsAsm.cc
rename to Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
index 684c0f79..3eafb610 100644
--- a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
@@ -560,6 +560,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
        VSTORE(2,%0,pUChi_02)					\
        : : "r" (out) : "memory" );
 
+#define nREDUCE(out)							\
+  asm (									\
+       VADD(UChi_00,UChi_10,UChi_00)					\
+       VADD(UChi_01,UChi_11,UChi_01)					\
+       VADD(UChi_02,UChi_12,UChi_02)					\
+       VADD(UChi_30,UChi_20,UChi_30)					\
+       VADD(UChi_31,UChi_21,UChi_31)					\
+       VADD(UChi_32,UChi_22,UChi_32)					\
+       VADD(UChi_00,UChi_30,UChi_00)					\
+       VADD(UChi_01,UChi_31,UChi_01)					\
+       VADD(UChi_02,UChi_32,UChi_02)				);	\
+  asm (VZERO(Chi_00)							\
+       VSUB(UChi_00,Chi_00,UChi_00)					\
+       VSUB(UChi_01,Chi_00,UChi_01)					\
+       VSUB(UChi_02,Chi_00,UChi_02)				);	\
+  asm (								\
+       VSTORE(0,%0,pUChi_00)					\
+       VSTORE(1,%0,pUChi_01)					\
+       VSTORE(2,%0,pUChi_02)					\
+       : : "r" (out) : "memory" );
+// nREDUCE (above): the same VADD tree as REDUCE, then a VSUB against a zeroed Chi_00 before the store; presumably 0 - UChi for the dag case (VSUB operand order is defined elsewhere).
 #define REDUCEa(out)					\
   asm (							\
   VADD(UChi_00,UChi_10,UChi_00)				\
@@ -571,6 +592,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSTORE(2,%0,pUChi_02)					\
   : : "r" (out) : "memory" );
 
+// dag variant of REDUCEa: adds a VSUB against a zeroed Chi_00 before the store. FIXME is sign right in the VSUB ? (operand order is defined elsewhere)
+#define nREDUCEa(out)					\
+  asm (							\
+  VADD(UChi_00,UChi_10,UChi_00)				\
+  VADD(UChi_01,UChi_11,UChi_01)				\
+  VADD(UChi_02,UChi_12,UChi_02)	);			\
+  asm (VZERO(Chi_00)							\
+       VSUB(UChi_00,Chi_00,UChi_00)					\
+       VSUB(UChi_01,Chi_00,UChi_01)					\
+       VSUB(UChi_02,Chi_00,UChi_02)				);	\
+  asm (									\
+       VSTORE(0,%0,pUChi_00)				\
+       VSTORE(1,%0,pUChi_01)				\
+       VSTORE(2,%0,pUChi_02)				\
+       : : "r" (out) : "memory" );
+
 #define PERMUTE_DIR(dir)			\
       permute##dir(Chi_0,Chi_0);\
       permute##dir(Chi_1,Chi_1);\
@@ -583,7 +620,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
 					 DoubledGaugeFieldView &U,
 					 DoubledGaugeFieldView &UUU,
 					 SiteSpinor *buf, int LLs,
-					 int sU, const FermionFieldView &in, FermionFieldView &out) 
+					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
   assert(0);
 };
@@ -647,7 +684,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
 								    SiteSpinor *buf, int LLs,
-								    int sU, const FermionFieldView &in, FermionFieldView &out) 
+								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
   uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -684,7 +721,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
     MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
 
     addr0 = (uint64_t) &out[sF];
-    REDUCE(addr0);
+    if ( dag ) {
+      nREDUCE(addr0);
+    } else { 
+      REDUCE(addr0);
+    }
    }
 #else 
     assert(0);
@@ -697,7 +738,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
 								    SiteSpinor *buf, int LLs,
-								    int sU, const FermionFieldView &in, FermionFieldView &out) 
+								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
 #ifdef AVX512
   uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -733,7 +774,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
     MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
 
     addr0 = (uint64_t) &out[sF];
-    REDUCE(addr0);
+    if ( dag ) {
+      nREDUCE(addr0);
+    } else { 
+      REDUCE(addr0);
+    }
   }
 #else 
   assert(0);
@@ -775,10 +820,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
 
 #include <simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeFieldView &U,
-								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
-								    int sU, const FermionFieldView &in, FermionFieldView &out) 
+							       DoubledGaugeFieldView &U,
+							       DoubledGaugeFieldView &UUU,
+							       SiteSpinor *buf, int LLs,
+							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
   uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -831,7 +876,11 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
     MULT_ADD_XYZT(gauge2,gauge3);  
 
     addr0 = (uint64_t) &out[sF];
-    REDUCEa(addr0);
+    if ( dag ) { 
+      nREDUCEa(addr0);
+    } else { 
+      REDUCEa(addr0);
+    }
   }
 #else 
   assert(0);
@@ -840,10 +889,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 
 #include <simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
-								    DoubledGaugeFieldView &U,
-								    DoubledGaugeFieldView &UUU,
-								    SiteSpinor *buf, int LLs,
-								    int sU, const FermionFieldView &in, FermionFieldView &out) 
+							       DoubledGaugeFieldView &U,
+							       DoubledGaugeFieldView &UUU,
+							       SiteSpinor *buf, int LLs,
+							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
   uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -896,7 +945,11 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
     MULT_ADD_XYZT(gauge2,gauge3);  
     
     addr0 = (uint64_t) &out[sF];
-    REDUCEa(addr0);
+    if ( dag ) {
+      nREDUCEa(addr0);
+    } else { 
+      REDUCEa(addr0);
+    }
   }
 #else 
   assert(0);
@@ -908,7 +961,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
 				  DoubledGaugeFieldView &U,			\
 				  DoubledGaugeFieldView &UUU,		\
 				  SiteSpinor *buf, int LLs,		\
-				  int sU, const FermionFieldView &in, FermionFieldView &out);
+				  int sU, const FermionFieldView &in, FermionFieldView &out,int dag);
 
 KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
 KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
diff --git a/Grid/qcd/action/fermion/StaggeredKernelsHand.cc b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
new file mode 100644
index 00000000..eefd4d40
--- /dev/null
+++ b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
@@ -0,0 +1,396 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/StaggeredKernelsHand.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#define LOAD_CHI(b)		\
+  const SiteSpinor & ref (b[offset]);	\
+    Chi_0=ref()()(0);\
+    Chi_1=ref()()(1);\
+    Chi_2=ref()()(2);
+// LOAD_CHI(b) (above): bind ref to b[offset] and copy its three innermost components (index 0..2) into Chi_0..Chi_2.
+// MULT(A,UChi) (below): UChi_r = sum over c of U_rc*Chi_c with the link U[sU](A); it assigns, it does not accumulate.
+// To splat or not to splat depends on the implementation
+#define MULT(A,UChi)				\
+  auto & ref(U[sU](A));			\
+   Impl::loadLinkElement(U_00,ref()(0,0));      \
+   Impl::loadLinkElement(U_10,ref()(1,0));      \
+   Impl::loadLinkElement(U_20,ref()(2,0));      \
+   Impl::loadLinkElement(U_01,ref()(0,1));      \
+   Impl::loadLinkElement(U_11,ref()(1,1));      \
+   Impl::loadLinkElement(U_21,ref()(2,1));      \
+   Impl::loadLinkElement(U_02,ref()(0,2));     \
+   Impl::loadLinkElement(U_12,ref()(1,2));     \
+   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    UChi ## _0  = U_00*Chi_0;	       \
+    UChi ## _1  = U_10*Chi_0;\
+    UChi ## _2  = U_20*Chi_0;\
+    UChi ## _0 += U_01*Chi_1;\
+    UChi ## _1 += U_11*Chi_1;\
+    UChi ## _2 += U_21*Chi_1;\
+    UChi ## _0 += U_02*Chi_2;\
+    UChi ## _1 += U_12*Chi_2;\
+    UChi ## _2 += U_22*Chi_2;
+// MULT_ADD(U,A,UChi): the same 3x3 product as MULT, but accumulated (+=) into UChi_*, with the link field U passed explicitly.
+#define MULT_ADD(U,A,UChi)			\
+  auto & ref(U[sU](A));			\
+   Impl::loadLinkElement(U_00,ref()(0,0));      \
+   Impl::loadLinkElement(U_10,ref()(1,0));      \
+   Impl::loadLinkElement(U_20,ref()(2,0));      \
+   Impl::loadLinkElement(U_01,ref()(0,1));      \
+   Impl::loadLinkElement(U_11,ref()(1,1));      \
+   Impl::loadLinkElement(U_21,ref()(2,1));      \
+   Impl::loadLinkElement(U_02,ref()(0,2));     \
+   Impl::loadLinkElement(U_12,ref()(1,2));     \
+   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    UChi ## _0 += U_00*Chi_0;	       \
+    UChi ## _1 += U_10*Chi_0;\
+    UChi ## _2 += U_20*Chi_0;\
+    UChi ## _0 += U_01*Chi_1;\
+    UChi ## _1 += U_11*Chi_1;\
+    UChi ## _2 += U_21*Chi_1;\
+    UChi ## _0 += U_02*Chi_2;\
+    UChi ## _1 += U_12*Chi_2;\
+    UChi ## _2 += U_22*Chi_2;
+
+// PERMUTE_DIR(dir): call permute<dir> on each of Chi_0..Chi_2, with the same variable as source and destination.
+#define PERMUTE_DIR(dir)			\
+  permute##dir(Chi_0,Chi_0);			\
+  permute##dir(Chi_1,Chi_1);			\
+  permute##dir(Chi_2,Chi_2);
+
+
+#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
+  SE=st.GetEntry(ptype,Dir+skew,sF);	\
+  offset = SE->_offset;			\
+  local  = SE->_is_local;		\
+  perm   = SE->_permute;		\
+  if ( local ) {						\
+    LOAD_CHI(in);					\
+    if ( perm) {						\
+      PERMUTE_DIR(Perm);					\
+    }								\
+  } else {							\
+    LOAD_CHI(buf);						\
+  }								
+// HAND_STENCIL_LEG_BASE (above) loads Chi from in (permuting if flagged) or from buf; _BEGIN (below) then assigns the product via MULT.
+#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
+  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
+  {								\
+    MULT(Dir,even);						\
+  }
+// HAND_STENCIL_LEG: the same load, but the product is accumulated via MULT_ADD with an explicit link field U.
+#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even)			\
+  HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\
+  {								\
+    MULT_ADD(U,Dir,even);					\
+  }
+
+
+// HAND_STENCIL_LEG_INT: load and accumulate only when the entry is local or st.same_node[Dir] is set; otherwise the leg is skipped.
+#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
+  SE=st.GetEntry(ptype,Dir+skew,sF);			\
+  offset = SE->_offset;					\
+  local  = SE->_is_local;				\
+  perm   = SE->_permute;				\
+  if ( local ) {					\
+    LOAD_CHI(in);				\
+    if ( perm) {					\
+      PERMUTE_DIR(Perm);				\
+    }							\
+  } else if ( st.same_node[Dir] ) {			\
+    LOAD_CHI(buf);					\
+  }							\
+  if (SE->_is_local || st.same_node[Dir] ) {		\
+    MULT_ADD(U,Dir,even);				\
+  }
+// HAND_STENCIL_LEG_EXT: the complementary case (not local and not same_node): count it in nmu, load from buf, accumulate.
+#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even)	\
+  SE=st.GetEntry(ptype,Dir+skew,sF);			\
+  offset = SE->_offset;					\
+  local  = SE->_is_local;				\
+  perm   = SE->_permute;				\
+  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
+    nmu++;							\
+    { LOAD_CHI(buf);	  }					\
+    { MULT_ADD(U,Dir,even); }					\
+  }								
+
+
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, 
+					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
+					  SiteSpinor *buf, int LLs, int sU, 
+					  const FermionFieldView &in, FermionFieldView &out,int dag) 
+{
+  // Hand-unrolled Int+Ext kernel: for each s in [0,LLs) sum all 16 stencil legs (8 with U, 8 with UUU at
+  // skew 8) into the even/odd accumulators and stream their sum (negated when dag is set) to out[sF].
+
+  Simd even_0; // 12 regs on knc
+  Simd even_1;
+  Simd even_2;
+  Simd odd_0; // 12 regs on knc
+  Simd odd_1;
+  Simd odd_2;
+
+  Simd Chi_0;    // two spinor; 6 regs
+  Simd Chi_1;
+  Simd Chi_2;
+  
+  Simd U_00;  // two rows of U matrix
+  Simd U_10;
+  Simd U_20;  
+  Simd U_01;
+  Simd U_11;
+  Simd U_21;  // 2 reg left.
+  Simd U_02;
+  Simd U_12;
+  Simd U_22; 
+
+  SiteSpinor result;
+  int offset,local,perm, ptype;
+
+  StencilEntry *SE;
+  int skew;
+
+  for(int s=0;s<LLs;s++){
+    int sF=s+LLs*sU;
+    // The first two legs use MULT (plain assignment), which initialises even_* and odd_*; no zeroing is needed.
+    skew = 0;
+    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);  
+    HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);   
+    HAND_STENCIL_LEG      (U,Zp,1,skew,even);  
+    HAND_STENCIL_LEG      (U,Tp,0,skew,odd);  
+    HAND_STENCIL_LEG      (U,Xm,3,skew,even);  
+    HAND_STENCIL_LEG      (U,Ym,2,skew,odd);   
+    HAND_STENCIL_LEG      (U,Zm,1,skew,even);  
+    HAND_STENCIL_LEG      (U,Tm,0,skew,odd);  
+    skew = 8;
+    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);  
+    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);   
+    HAND_STENCIL_LEG(UUU,Zp,1,skew,even);  
+    HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);  
+    HAND_STENCIL_LEG(UUU,Xm,3,skew,even);  
+    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);   
+    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);  
+    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);  
+    
+    if ( dag ) {
+      result()()(0) = - even_0 - odd_0;
+      result()()(1) = - even_1 - odd_1;
+      result()()(2) = - even_2 - odd_2;
+    } else { 
+      result()()(0) = even_0 + odd_0;
+      result()()(1) = even_1 + odd_1;
+      result()()(2) = even_2 + odd_2;
+    }
+    vstream(out[sF],result);
+  }
+}
+
+
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, 
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+					     SiteSpinor *buf, int LLs, int sU, 
+					     const FermionFieldView &in, FermionFieldView &out,int dag) 
+{
+  // Hand-unrolled interior kernel: accumulates only legs that are local or flagged st.same_node[Dir],
+  // starting from zeroed accumulators, and streams the sum (negated when dag is set) to out[sF].
+
+  Simd even_0; // 12 regs on knc
+  Simd even_1;
+  Simd even_2;
+  Simd odd_0; // 12 regs on knc
+  Simd odd_1;
+  Simd odd_2;
+
+  Simd Chi_0;    // two spinor; 6 regs
+  Simd Chi_1;
+  Simd Chi_2;
+  
+  Simd U_00;  // two rows of U matrix
+  Simd U_10;
+  Simd U_20;  
+  Simd U_01;
+  Simd U_11;
+  Simd U_21;  // 2 reg left.
+  Simd U_02;
+  Simd U_12;
+  Simd U_22; 
+
+  SiteSpinor result;
+  int offset,local,perm, ptype;
+
+  StencilEntry *SE;
+  int skew;
+
+  for(int s=0;s<LLs;s++){
+    int sF=s+LLs*sU;
+
+    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
+     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+
+    skew = 0;
+    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
+    HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);   
+    HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);  
+    HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);  
+    HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);  
+    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);   
+    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);  
+    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);  
+    skew = 8;
+    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);  
+    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);   
+    HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);  
+    HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);  
+    HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);  
+    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);   
+    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);  
+    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);  
+
+    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
+    if ( dag ) {
+      result()()(0) = - even_0 - odd_0;
+      result()()(1) = - even_1 - odd_1;
+      result()()(2) = - even_2 - odd_2;
+    } else { 
+      result()()(0) = even_0 + odd_0;
+      result()()(1) = even_1 + odd_1;
+      result()()(2) = even_2 + odd_2;
+    }
+    vstream(out[sF],result);
+  }
+}
+
+
+template <class Impl>
+void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, 
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
+					     SiteSpinor *buf, int LLs, int sU, 
+					     const FermionFieldView &in, FermionFieldView &out,int dag) 
+{
+  // Hand-unrolled exterior kernel: accumulates only legs that are neither local nor st.same_node[Dir]
+  // (read from buf) and, if any were found (nmu != 0), adds their sum (negated when dag is set) to out[sF].
+
+  Simd even_0; // 12 regs on knc
+  Simd even_1;
+  Simd even_2;
+  Simd odd_0; // 12 regs on knc
+  Simd odd_1;
+  Simd odd_2;
+
+  Simd Chi_0;    // two spinor; 6 regs
+  Simd Chi_1;
+  Simd Chi_2;
+  
+  Simd U_00;  // two rows of U matrix
+  Simd U_10;
+  Simd U_20;  
+  Simd U_01;
+  Simd U_11;
+  Simd U_21;  // 2 reg left.
+  Simd U_02;
+  Simd U_12;
+  Simd U_22; 
+
+  SiteSpinor result;
+  int offset,local,perm, ptype;
+
+  StencilEntry *SE;
+  int skew;
+
+  for(int s=0;s<LLs;s++){
+    int sF=s+LLs*sU;
+
+    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
+     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    int nmu=0;
+    skew = 0;
+    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
+    HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);   
+    HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);  
+    HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);  
+    HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);  
+    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);   
+    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);  
+    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);  
+    skew = 8;
+    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);  
+    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);   
+    HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);  
+    HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);  
+    HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);  
+    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);   
+    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);  
+    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);  
+
+    // Add sum of all exterior connected stencil legs
+    if ( nmu ) { 
+      if ( dag ) {
+	result()()(0) = - even_0 - odd_0;
+	result()()(1) = - even_1 - odd_1;
+	result()()(2) = - even_2 - odd_2;
+      } else { 
+	result()()(0) = even_0 + odd_0;
+	result()()(1) = even_1 + odd_1;
+	result()()(2) = even_2 + odd_2;
+      }
+      out[sF] = out[sF] + result;
+    }
+  }
+}
+
+
+#define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
+  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
+						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
+						     SiteSpinor *buf, int LLs, int sU, \
+						     const FermionFieldView &in, FermionFieldView &out, int dag); \
+									\
+  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
+						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
+						     SiteSpinor *buf, int LLs, int sU, \
+						     const FermionFieldView &in, FermionFieldView &out, int dag); \
+									\
+  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
+						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
+						     SiteSpinor *buf, int LLs, int sU, \
+						     const FermionFieldView &in, FermionFieldView &out, int dag); \
+
+DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
+DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
+DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
+DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
+
+NAMESPACE_END(Grid);
+
+
diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.cc b/Grid/qcd/action/fermion/WilsonCloverFermion.cc
new file mode 100644
index 00000000..d64185ba
--- /dev/null
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.cc
@@ -0,0 +1,240 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./Grid/qcd/action/fermion/WilsonCloverFermion.cc
+
+    Copyright (C) 2017
+
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/spin/Dirac.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// *NOT* EO: out = Dhop(in) + Mooee(in); the return value is norm2(out).
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
+{
+  // Hopping (Wilson) part is written straight into out
+  out.Checkerboard() = in.Checkerboard();
+  this->Dhop(in, out, DaggerNo);
+
+  // Clover part is built in a scratch field and then accumulated
+  FermionField clover(out.Grid());
+  Mooee(in, clover);
+  out += clover;
+
+  const RealD nrm = norm2(out);
+  return nrm;
+}
+
+template <class Impl>
+RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
+{
+  // Daggered hopping (Wilson) part is written straight into out
+  out.Checkerboard() = in.Checkerboard();
+  this->Dhop(in, out, DaggerYes);
+
+  // Daggered clover part is built in a scratch field and then accumulated
+  FermionField clover(out.Grid());
+  MooeeDag(in, clover);
+  out += clover;
+
+  const RealD nrm = norm2(out);
+  return nrm;
+}
+// Rebuilds CloverTerm, its site-wise inverse, and the even/odd (and adjoint) copies from the gauge field _Umu.
+template <class Impl>
+void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
+{
+  WilsonFermion<Impl>::ImportGauge(_Umu); // let the Wilson base class import the links first
+  GridBase *grid = _Umu.Grid();
+  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
+
+  // Compute the field strength terms mu>nu
+  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
+  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
+  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
+
+  // Compute the Clover Operator acting on Colour and Spin
+  // multiply here by the clover coefficients for the anisotropy
+  CloverTerm  = fillCloverYZ(Bx) * csw_r;
+  CloverTerm += fillCloverXZ(By) * csw_r;
+  CloverTerm += fillCloverXY(Bz) * csw_r;
+  CloverTerm += fillCloverXT(Ex) * csw_t;
+  CloverTerm += fillCloverYT(Ey) * csw_t;
+  CloverTerm += fillCloverZT(Ez) * csw_t;
+  CloverTerm += diag_mass;
+
+  int lvol = _Umu.Grid()->lSites();
+  int DimRep = Impl::Dimension;
+
+  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+
+  Coordinate lcoor;
+  typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
+
+  for (int site = 0; site < lvol; site++)
+  {
+    grid->LocalIndexToLocalCoor(site, lcoor);
+    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
+    peekLocalSite(Qx, CloverTerm, lcoor);
+    Qxinv = Zero();
+    // Flatten the (spin j,k) x (colour a,b) site matrix into one dense (Ns*DimRep)^2 Eigen matrix
+    for (int j = 0; j < Ns; j++)
+      for (int k = 0; k < Ns; k++)
+        for (int a = 0; a < DimRep; a++)
+          for (int b = 0; b < DimRep; b++)
+            EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
+    // Invert the dense site matrix with Eigen
+
+    EigenInvCloverOp = EigenCloverOp.inverse();
+    // Unpack the inverse back into the (spin, colour) site layout
+    for (int j = 0; j < Ns; j++)
+      for (int k = 0; k < Ns; k++)
+        for (int a = 0; a < DimRep; a++)
+          for (int b = 0; b < DimRep; b++)
+            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
+    // Store the inverted site matrix at the same local coordinate
+    // of CloverTermInv
+    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
+  }
+
+  // Separate the even and odd parts
+  pickCheckerboard(Even, CloverTermEven, CloverTerm);
+  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
+
+  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
+  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
+
+  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
+  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
+
+  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+}
+
+// out = (clover term) * in
+template <class Impl>
+void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
+  MooeeInternal(in, out, DaggerNo, InverseNo);
+}
+
+// out = adj(clover term) * in
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
+  MooeeInternal(in, out, DaggerYes, InverseNo);
+}
+
+// out = (site-wise inverse of the clover term) * in
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
+  MooeeInternal(in, out, DaggerNo, InverseYes);
+}
+
+// out = adj(site-wise inverse of the clover term) * in
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
+  MooeeInternal(in, out, DaggerYes, InverseYes);
+}
+
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
+{
+  // Multiplies `in` by the stored clover field selected by (dag, inv) and,
+  // on a checkerboarded grid, by the parity of `in`; the result goes to `out`.
+  out.Checkerboard() = in.Checkerboard();
+  CloverFieldType *Clover;
+  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+
+  if (!in.Grid()->_isCheckerBoarded)
+  {
+    // Full grid: only the plain term and its inverse are stored, so the
+    // adjoint is formed on the fly.
+    Clover = (inv) ? &CloverTermInv : &CloverTerm;
+    if (dag)
+    {
+      out = adj(*Clover) * in;
+    }
+    else
+    {
+      out = *Clover * in;
+    }
+    return;
+  }
+
+  // Checkerboarded grid: pick the precomputed even/odd field.
+  const bool isOdd = (in.Checkerboard() == Odd);
+  if (dag)
+  {
+    if (isOdd)
+    {
+      Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
+    }
+    else
+    {
+      Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
+    }
+  }
+  else
+  {
+    if (isOdd)
+    {
+      Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
+    }
+    else
+    {
+      Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
+    }
+  }
+  // The Dag fields already hold the adjoint, so a plain product suffices.
+  out = *Clover * in;
+
+} // MooeeInternal
+
+
+// MooDeriv: not implemented yet; reaching it trips assert(0) and `mat` is left untouched
+template <class Impl>
+void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
+{
+  assert(0); // not implemented yet
+}
+
+// MeeDeriv: not implemented yet; reaching it trips assert(0) and `mat` is left untouched
+template <class Impl>
+void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
+{
+  assert(0); // not implemented yet
+}
+
+FermOpTemplateInstantiate(WilsonCloverFermion);
+AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
+TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
+//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h
new file mode 100644
index 00000000..0d5c9616
--- /dev/null
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -0,0 +1,374 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./Grid/qcd/action/fermion/WilsonCloverFermion.h
+
+    Copyright (C) 2017
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+    Author: David Preti <>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////////////////////
+// Wilson Clover
+//
+// Operator ( with anisotropy coefficients):
+//
+// Q =   1 + (Nd-1)/xi_0 + m
+//     + W_t + (nu/xi_0) * W_s
+//     - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_s/xi_0) * sum_ss (sigma_ss F_ss)  ]
+//
+// s spatial, t temporal directions.
+// where W_t and W_s are the temporal and spatial components of the
+// Wilson Dirac operator
+//
+// csw_r = csw_t to recover the isotropic version
+//////////////////////////////////////////////////////////////////
+
+template <class Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>
+{
+public:
+  // Types definitions
+  INHERIT_IMPL_TYPES(Impl);
+  template <typename vtype>
+  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
+  typedef iImplClover<Simd> SiteCloverType;
+  typedef Lattice<SiteCloverType> CloverFieldType;
+
+public:
+  typedef WilsonFermion<Impl> WilsonBase;
+
+  virtual void Instantiatable(void){};
+  // Constructor: forwards to WilsonFermion, allocates the clover fields, sets the coefficients, then imports _Umu
+  WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                      GridRedBlackCartesian &Hgrid,
+                      const RealD _mass,
+                      const RealD _csw_r = 0.0,
+                      const RealD _csw_t = 0.0,
+                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
+                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
+                                                                                     Fgrid,
+                                                                                     Hgrid,
+                                                                                     _mass, impl_p, clover_anisotropy),
+                                                                 CloverTerm(&Fgrid),
+                                                                 CloverTermInv(&Fgrid),
+                                                                 CloverTermEven(&Hgrid),
+                                                                 CloverTermOdd(&Hgrid),
+                                                                 CloverTermInvEven(&Hgrid),
+                                                                 CloverTermInvOdd(&Hgrid),
+                                                                 CloverTermDagEven(&Hgrid),
+                                                                 CloverTermDagOdd(&Hgrid),
+                                                                 CloverTermInvDagEven(&Hgrid),
+                                                                 CloverTermInvDagOdd(&Hgrid)
+  {
+    assert(Nd == 4); // require 4 dimensions
+    // Stored coefficients carry a factor 1/2; csw_r is also divided by xi_0 when anisotropic
+    if (clover_anisotropy.isAnisotropic)
+    {
+      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
+      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
+    }
+    else
+    {
+      csw_r = _csw_r * 0.5;
+      diag_mass = 4.0 + _mass;
+    }
+    csw_t = _csw_t * 0.5;
+    // A zero coefficient is allowed but logged as a warning
+    if (csw_r == 0)
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
+    if (csw_t == 0)
+      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
+    // Must come last: ImportGauge reads csw_r, csw_t and diag_mass set above
+    ImportGauge(_Umu);
+  }
+
+  virtual RealD M(const FermionField &in, FermionField &out);
+  virtual RealD Mdag(const FermionField &in, FermionField &out);
+
+  virtual void Mooee(const FermionField &in, FermionField &out);
+  virtual void MooeeDag(const FermionField &in, FermionField &out);
+  virtual void MooeeInv(const FermionField &in, FermionField &out);
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out);
+  virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
+
+  //virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+  virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
+
+  void ImportGauge(const GaugeField &_Umu);
+
+  // Derivative parts, unpreconditioned pseudofermions.
+  //   force : output; overwritten with hopping-term plus clover-term derivative
+  //   X, Y  : fermion fields whose outer product drives the clover term
+  //   dag   : forwarded to DhopDeriv for the hopping-term derivative
+  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
+  {
+    conformable(X.Grid(), Y.Grid());
+    conformable(X.Grid(), force.Grid());
+    GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
+    GaugeField clover_force(force.Grid());
+    PropagatorField Lambda(force.Grid());
+
+    // Guido: Here we are hitting some performance issues:
+    // need to extract the components of the DoubledGaugeField
+    // for each call
+    // Possible solution
+    // Create a vector object to store them? (cons: wasting space)
+    std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
+
+    Impl::extractLinkField(U, this->Umu);
+
+    force = Zero();
+    // Derivative of the Wilson hopping term
+    this->DhopDeriv(force, X, Y, dag);
+
+    ///////////////////////////////////////////////////////////
+    // Clover term derivative
+    ///////////////////////////////////////////////////////////
+    Impl::outerProductImpl(Lambda, X, Y);
+
+    Gamma::Algebra sigma[] = {
+        Gamma::Algebra::SigmaXY,
+        Gamma::Algebra::SigmaXZ,
+        Gamma::Algebra::SigmaXT,
+        Gamma::Algebra::MinusSigmaXY,
+        Gamma::Algebra::SigmaYZ,
+        Gamma::Algebra::SigmaYT,
+        Gamma::Algebra::MinusSigmaXZ,
+        Gamma::Algebra::MinusSigmaYZ,
+        Gamma::Algebra::SigmaZT,
+        Gamma::Algebra::MinusSigmaXT,
+        Gamma::Algebra::MinusSigmaYT,
+        Gamma::Algebra::MinusSigmaZT};
+
+    /*
+      sigma_{\mu \nu}=
+      | 0         sigma[0]  sigma[1]  sigma[2] |
+      | sigma[3]    0       sigma[4]  sigma[5] |
+      | sigma[6]  sigma[7]     0      sigma[8] |
+      | sigma[9]  sigma[10] sigma[11]   0      |
+    */
+
+    int count = 0;
+    clover_force = Zero();
+    for (int mu = 0; mu < 4; mu++)
+    {
+      force_mu = Zero();
+      for (int nu = 0; nu < 4; nu++)
+      {
+        if (mu == nu)
+          continue;
+
+        // Planes containing the temporal direction carry csw_t, purely
+        // spatial planes carry csw_r. mu and nu run over 0..3, so the
+        // temporal index is Tdir; the former comparison against 4 never
+        // matched and silently applied csw_r to every plane.
+        const bool temporal = (mu == Tdir) || (nu == Tdir);
+        const RealD factor = 2.0 * (temporal ? csw_t : csw_r);
+
+        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
+        count++;
+      }
+
+      pokeLorentz(clover_force, U[mu] * force_mu, mu);
+    }
+    // csw_r / csw_t were already applied per plane through `factor`
+    force += clover_force;
+  }
+
+  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
+  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
+  {
+    conformable(lambda.Grid(), U[0].Grid());
+    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
+    // Returns four upper-staple terms (C1+..C4+) minus four lower-staple terms (C1-..C4-),
+    // each with lambda or adj(lambda) multiplied in at a different place.
+    // insertion in upper staple; please check redundancy of shift operations
+    // C1+
+    tmp = lambda * U[nu];
+    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C2+
+    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
+
+    // C3+
+    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
+
+    // C4+
+    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
+
+    // insertion in lower staple
+    // C1-
+    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C2-
+    tmp = adj(lambda) * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+
+    // C3-
+    tmp = lambda * U[nu];
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
+
+    // C4-
+    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
+
+    return out;
+  }
+
+private:
+  // here fixing the 4 dimensions, make it more general?
+
+  RealD csw_r;                                               // Clover coefficient - spatial
+  RealD csw_t;                                               // Clover coefficient - temporal
+  RealD diag_mass;                                           // Mass term
+  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
+  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
+  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
+  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
+  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
+
+  // eventually these can be compressed into 6x6 blocks instead of the 12x12
+  // using the DeGrand-Rossi basis for the gamma matrices
+  // YZ plane: writes -i*F into spin entries (0,1),(1,0),(2,3),(3,2) of a zeroed field on F's grid (loop bound from T).
+  CloverFieldType fillCloverYZ(const GaugeLinkField &F) {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_loop( (int i = 0; i < T.Grid()->oSites(); i++),
+    {
+      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
+      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
+      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
+    });
+
+    return T;
+  }
+
+  // XZ plane: writes -F into spin entries (0,1),(2,3) and +F into (1,0),(3,2) of a zeroed field on F's grid (loop bound from T).
+  CloverFieldType fillCloverXZ(const GaugeLinkField &F) {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_loop( (int i = 0; i < T.Grid()->oSites(); i++),
+    {
+      T_v[i]()(0, 1) = -F_v[i]()();
+      T_v[i]()(1, 0) = F_v[i]()();
+      T_v[i]()(2, 3) = -F_v[i]()();
+      T_v[i]()(3, 2) = F_v[i]()();
+    });
+
+    return T;
+  }
+
+  // XY plane: writes -i*F into spin entries (0,0),(2,2) and +i*F into (1,1),(3,3)
+  // of a zeroed field on F's grid; the loop is bounded by T's own grid.
+  CloverFieldType fillCloverXY(const GaugeLinkField &F) {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_loop( (int i = 0; i < T.Grid()->oSites(); i++),
+    {
+      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
+      T_v[i]()(1, 1) = timesI(F_v[i]()());
+      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 3) = timesI(F_v[i]()());
+    });
+
+    return T;
+  }
+
+  // XT plane: writes +i*F into spin entries (0,1),(1,0) and -i*F into (2,3),(3,2) of a zeroed field on F's grid (loop bound from T).
+  CloverFieldType fillCloverXT(const GaugeLinkField &F) {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_loop( (int i = 0; i < T.Grid()->oSites(); i++),
+    {
+      T_v[i]()(0, 1) = timesI(F_v[i]()());
+      T_v[i]()(1, 0) = timesI(F_v[i]()());
+      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
+    });
+
+    return T;
+  }
+
+  // YT plane: writes -F into spin entries (0,1),(3,2) and +F into (1,0),(2,3) of a zeroed field on F's grid (loop bound from T).
+  CloverFieldType fillCloverYT(const GaugeLinkField &F) {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_loop( (int i = 0; i < T.Grid()->oSites(); i++),
+    {
+      T_v[i]()(0, 1) = -(F_v[i]()());
+      T_v[i]()(1, 0) = (F_v[i]()());
+      T_v[i]()(2, 3) = (F_v[i]()());
+      T_v[i]()(3, 2) = -(F_v[i]()());
+    });
+
+    return T;
+  }
+
+  // ZT plane: writes +i*F into spin entries (0,0),(3,3) and -i*F into (1,1),(2,2)
+  // of a zeroed field on F's grid; the loop is bounded by T's own grid.
+  CloverFieldType fillCloverZT(const GaugeLinkField &F) {
+    CloverFieldType T(F.Grid());
+    T = Zero();
+
+    auto T_v = T.View();
+    auto F_v = F.View();
+    thread_loop((int i = 0; i < T.Grid()->oSites(); i++),
+    {
+      T_v[i]()(0, 0) = timesI(F_v[i]()());
+      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
+      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
+      T_v[i]()(3, 3) = timesI(F_v[i]()());
+    });
+
+    return T;
+  }
+};
+NAMESPACE_END(Grid);
+
+
+
diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
similarity index 94%
rename from lib/qcd/action/fermion/WilsonCompressor.h
rename to Grid/qcd/action/fermion/WilsonCompressor.h
index f09cb471..f5a8f46b 100644
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -69,38 +69,48 @@ public:
   /* Compress includes precision change if mpi data is not same */
   /*****************************************************/
   accelerator_inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
-    projector::Proj(buf[o],in,mu,dag);
+    SiteHalfSpinor tmp;
+    projector::Proj(tmp,in,mu,dag);
+    vstream(buf[o],tmp);
   }
 
   /*****************************************************/
   /* Exchange includes precision change if mpi data is not same */
   /*****************************************************/
   accelerator_inline void Exchange(SiteHalfSpinor *mp,
-                       SiteHalfSpinor *vp0,
-                       SiteHalfSpinor *vp1,
-		       Integer type,Integer o){
-    exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
+				   const SiteHalfSpinor * __restrict__ vp0,
+				   const SiteHalfSpinor * __restrict__ vp1,
+				   Integer type,Integer o){
+    SiteHalfSpinor tmp1;
+    SiteHalfSpinor tmp2;
+    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
+    vstream(mp[2*o  ],tmp1);
+    vstream(mp[2*o+1],tmp2);
   }
 
   /*****************************************************/
   /* Have a decompression step if mpi data is not same */
   /*****************************************************/
-  accelerator_inline void Decompress(SiteHalfSpinor *out,
-			 SiteHalfSpinor *in, Integer o) {    
+  accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
+				     SiteHalfSpinor * __restrict__ in, Integer o) {    
     assert(0);
   }
 
   /*****************************************************/
   /* Compress Exchange                                 */
   /*****************************************************/
-  accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
-			       SiteHalfSpinor *out1,
-			       const SiteSpinor *in,
-			       Integer j,Integer k, Integer m,Integer type){
+  accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
+					   SiteHalfSpinor * __restrict__ out1,
+					   const SiteSpinor * __restrict__ in,
+					   Integer j,Integer k, Integer m,Integer type)
+  {
     SiteHalfSpinor temp1, temp2;
+    SiteHalfSpinor temp3, temp4;
     projector::Proj(temp1,in[k],mu,dag);
     projector::Proj(temp2,in[m],mu,dag);
-    exchange(out0[j],out1[j],temp1,temp2,type);
+    exchange(temp3,temp4,temp1,temp2,type);
+    vstream(out0[j],temp3);
+    vstream(out1[j],temp4);
   }
 
   /*****************************************************/
diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/Grid/qcd/action/fermion/WilsonFermion.cc
similarity index 77%
rename from lib/qcd/action/fermion/WilsonFermion.cc
rename to Grid/qcd/action/fermion/WilsonFermion.cc
index 5468a4a9..57581ff0 100644
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/Grid/qcd/action/fermion/WilsonFermion.cc
@@ -45,8 +45,10 @@ int WilsonFermionStatic::HandOptDslash;
 template <class Impl>
 WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                                    GridRedBlackCartesian &Hgrid, RealD _mass,
-                                   const ImplParams &p)
-  : Impl(p),
+                                   const ImplParams &p,
+                                   const WilsonAnisotropyCoefficients &anis)
+  : 
+    Kernels(p),
     _grid(&Fgrid),
     _cbgrid(&Hgrid),
     Stencil(&Fgrid, npoint, Even, directions, displacements),
@@ -58,16 +60,42 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
     Umu(&Fgrid),
     UmuEven(&Hgrid),
     UmuOdd(&Hgrid),
-    _tmp(&Hgrid)
+      _tmp(&Hgrid),
+      anisotropyCoeff(anis)
 {
   // Allocate the required comms buffer
   ImportGauge(_Umu);
+  if  (anisotropyCoeff.isAnisotropic){
+    diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
+  } else {
+    diag_mass = 4.0 + mass;
+  }
+
+
 }
 
 template <class Impl>
-void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
+void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) 
+{
   GaugeField HUmu(_Umu.Grid());
-  HUmu = _Umu * (-0.5);
+
+  //Here multiply the anisotropy coefficients
+  if (anisotropyCoeff.isAnisotropic)
+  {
+
+    for (int mu = 0; mu < Nd; mu++)
+    {
+      GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
+      if (mu != anisotropyCoeff.t_direction)
+        U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
+
+      PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
+    }
+  }
+  else
+  {
+    HUmu = _Umu * (-0.5);
+  }
   Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
   pickCheckerboard(Even, UmuEven, Umu);
   pickCheckerboard(Odd, UmuOdd, Umu);
@@ -81,14 +109,14 @@ template <class Impl>
 RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerNo);
-  return axpy_norm(out, 4 + mass, in, out);
+  return axpy_norm(out, diag_mass, in, out);
 }
 
 template <class Impl>
 RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
   out.Checkerboard() = in.Checkerboard();
   Dhop(in, out, DaggerYes);
-  return axpy_norm(out, 4 + mass, in, out);
+  return axpy_norm(out, diag_mass, in, out);
 }
 
 template <class Impl>
@@ -112,7 +140,7 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
 template <class Impl>
 void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
   out.Checkerboard() = in.Checkerboard();
-  typename FermionField::scalar_type scal(4.0 + mass);
+  typename FermionField::scalar_type scal(diag_mass);
   out = scal * in;
 }
 
@@ -125,7 +153,7 @@ void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
 template<class Impl>
 void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
   out.Checkerboard() = in.Checkerboard();
-  out = (1.0/(4.0+mass))*in;
+  out = (1.0/(diag_mass))*in;
 }
   
 template<class Impl>
@@ -134,7 +162,7 @@ void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
   MooeeInv(in,out);
 }
 template<class Impl>
-void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m) 
+void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector<double> twist)
 {  
   typedef typename FermionField::vector_type vector_type;
   typedef typename FermionField::scalar_type ScalComplex;
@@ -167,6 +195,7 @@ void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const Fermi
     RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
     
     kmu = TwoPiL * kmu;
+    kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
     
     wilson = wilson + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
     
@@ -316,7 +345,8 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
 };
 
 template <class Impl>
-void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) {
+void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) 
+{
   Compressor compressor(dag);
 
   Stencil.HaloExchange(in, compressor);
@@ -334,8 +364,102 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                        DoubledGaugeField &U,
                                        const FermionField &in,
                                        FermionField &out, int dag) {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
+    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
+  else
+#endif 
+    DhopInternalSerial(st,lo,U,in,out,dag);
 
+}
+
+template <class Impl>
+void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+						      DoubledGaugeField &U,
+						      const FermionField &in,
+						      FermionField &out, int dag) {
+  assert((dag == DaggerNo) || (dag == DaggerYes));
+#ifdef GRID_OMP
+  Compressor compressor(dag);
+  int len =  U.Grid()->oSites();
+  const int LLs =  1;
+
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  st.CommsMergeSHM(compressor);
+#pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    assert(nthreads > ncomms);
+    if (tid >= ncomms) {
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = len;
+      int chunk = n / nthreads;
+      int rem   = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+      // do the compute
+      auto U_v   = U.View();
+      auto in_v  = in.View();
+      auto out_v = out.View();
+      auto st_v  = st.View();
+      int Opt = WilsonKernelsStatic::Opt;
+
+      if (dag == DaggerYes) {
+        for (int sss = myblock; sss < myblock+myn; ++sss) {
+	  Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
+	  //	  Kernels::DhopSiteDag(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
+	}
+      } else {
+        for (int sss = myblock; sss < myblock+myn; ++sss) {
+	  Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
+	  //	  Kernels::DhopSite(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
+	}
+      } 
+
+    } else {
+      st.CommunicateThreaded();
+    }
+  }  //pragma
+
+  {
+    auto U_v   = U.View();
+    auto in_v  = in.View();
+    auto out_v = out.View();
+    auto st_v  =  st.View();
+    int Opt = WilsonKernelsStatic::Opt;
+    if (dag == DaggerYes) {
+      parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) {
+	Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
+      }
+    } else {
+      parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) {
+	Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
+      }
+    }
+  }
+#else
+  assert(0);
+#endif
+};
+
+
+template <class Impl>
+void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
+                                       DoubledGaugeField &U,
+                                       const FermionField &in,
+                                       FermionField &out, int dag) {
+  assert((dag == DaggerNo) || (dag == DaggerYes));
   Compressor compressor(dag);
   st.HaloExchange(in, compressor);
 
@@ -354,6 +478,7 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
     });
   }
 };
+/*Change ends */
 
 /*******************************************************************************
  * Conserved current utilities for Wilson fermions, for contracting propagators
@@ -396,40 +521,33 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
   });
 }
 
+
 template <class Impl>
 void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
                                               PropagatorField &q_out,
                                               Current curr_type,
                                               unsigned int mu,
-                                              std::vector<Real> mom,
                                               unsigned int tmin, 
-                                              unsigned int tmax)
+                                              unsigned int tmax,
+					      ComplexField &lattice_cmplx)
 {
   conformable(_grid, q_in.Grid());
   conformable(_grid, q_out.Grid());
-  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
+
+  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
   Complex i(0.0,1.0);
   PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
   unsigned int tshift = (mu == Tp) ? 1 : 0;
   unsigned int LLt    = GridDefaultLatt()[Tp];
 
-  // Momentum projection
-  ph = Zero();
-  for(unsigned int mu = 0; mu < Nd - 1; mu++)
-    {
-      LatticeCoordinate(coor, mu);
-      ph = ph + mom[mu]*coor*((1./(_grid->_fdimensions[mu])));
-    }
-  ph = exp((Real)(2*M_PI)*i*ph);
-
   q_out = Zero();
   LatticeInteger coords(_grid);
   LatticeCoordinate(coords, Tp);
 
   // Need q(x + mu) and q(x - mu).
-  tmp = Cshift(q_in, mu, 1);
-  tmpFwd = tmp*ph;
-  tmp = ph*q_in;
+  tmp    = Cshift(q_in, mu, 1);
+  tmpFwd = tmp*lattice_cmplx;
+  tmp    = lattice_cmplx*q_in;
   tmpBwd = Cshift(tmp, mu, -1);
 
   auto coords_v = coords.View();
diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h
similarity index 82%
rename from lib/qcd/action/fermion/WilsonFermion.h
rename to Grid/qcd/action/fermion/WilsonFermion.h
index 9046b2c2..3a712435 100644
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -28,8 +28,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#ifndef GRID_QCD_WILSON_FERMION_H
-#define GRID_QCD_WILSON_FERMION_H
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -42,8 +41,24 @@ public:
   static const int npoint = 8;
 };
 
+ // Anisotropy settings accepted by the WilsonFermion constructor (serializable).
+ // Default-constructed values: isAnisotropic=false, t_direction=Nd-1,
+ // xi_0=1.0, nu=1.0.
+ struct WilsonAnisotropyCoefficients : Serializable {
+   GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonAnisotropyCoefficients,
+                                   bool, isAnisotropic,
+                                   int, t_direction,
+                                   double, xi_0,
+                                   double, nu);
+
+   WilsonAnisotropyCoefficients()
+     : isAnisotropic(false), t_direction(Nd-1),
+       xi_0(1.0), nu(1.0) {}
+};
+
 template <class Impl>
-class WilsonFermion : public WilsonFermionStatic, public Impl {
+class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic 
+{
 public:
   INHERIT_IMPL_TYPES(Impl);
   typedef WilsonKernels<Impl> Kernels;
@@ -63,8 +78,8 @@ public:
   // override multiply; cut number routines if pass dagger argument
   // and also make interface more uniformly consistent
   //////////////////////////////////////////////////////////////////
-  RealD M(const FermionField &in, FermionField &out);
-  RealD Mdag(const FermionField &in, FermionField &out);
+  virtual RealD M(const FermionField &in, FermionField &out);
+  virtual RealD Mdag(const FermionField &in, FermionField &out);
 
   /////////////////////////////////////////////////////////
   // half checkerboard operations
@@ -79,7 +94,7 @@ public:
   virtual void MooeeInv(const FermionField &in, FermionField &out);
   virtual void MooeeInvDag(const FermionField &in, FermionField &out);
 
-  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass) ;
+  virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector<double> twist) ;
 
   ////////////////////////
   // Derivative interface
@@ -113,10 +128,17 @@ public:
   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                     const FermionField &in, FermionField &out, int dag);
 
+  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                    const FermionField &in, FermionField &out, int dag);
+
+  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                    const FermionField &in, FermionField &out, int dag);
+
   // Constructor
   WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                 GridRedBlackCartesian &Hgrid, RealD _mass,
-                const ImplParams &p = ImplParams());
+                const ImplParams &p = ImplParams(), 
+                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
 
   // DoubleStore impl dependent
   void ImportGauge(const GaugeField &_Umu);
@@ -127,7 +149,10 @@ public:
 
   //    protected:
 public:
+  virtual RealD Mass(void) { return mass; } // read accessor for the public 'mass' member declared just below
+  virtual int   isTrivialEE(void) { return 1; } // always 1 here; presumably flags a trivial even-even block - confirm with callers
   RealD mass;
+  RealD diag_mass;
 
   GridBase *_grid;
   GridBase *_cbgrid;
@@ -145,6 +170,8 @@ public:
   LebesgueOrder Lebesgue;
   LebesgueOrder LebesgueEvenOdd;
   
+  WilsonAnisotropyCoefficients anisotropyCoeff;
+  
   ///////////////////////////////////////////////////////////////
   // Conserved current utilities
   ///////////////////////////////////////////////////////////////
@@ -157,9 +184,9 @@ public:
                            PropagatorField &q_out,
                            Current curr_type,
                            unsigned int mu, 
-                           std::vector<Real> mom,
                            unsigned int tmin,
-                           unsigned int tmax);
+                             unsigned int tmax,
+			     ComplexField &lattice_cmplx);
 };
 
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
@@ -167,4 +194,4 @@ typedef WilsonFermion<WilsonImplD> WilsonFermionD;
 
 NAMESPACE_END(Grid);
 
-#endif
+
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/Grid/qcd/action/fermion/WilsonFermion5D.cc
similarity index 59%
rename from lib/qcd/action/fermion/WilsonFermion5D.cc
rename to Grid/qcd/action/fermion/WilsonFermion5D.cc
index 01fde381..1c1afa81 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.cc
@@ -13,6 +13,7 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -29,8 +30,8 @@ Author: Andrew Lawson <andrew.lawson1991@gmail.com>
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
     See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
+    *************************************************************************************/
+    /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h>
 #include <Grid/perfmon/PerfCount.h>
@@ -41,15 +42,15 @@ NAMESPACE_BEGIN(Grid);
 const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
 const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
 
-// 5d lattice for DWF.
+  // 5d lattice for DWF.
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
-				       GridCartesian         &FiveDimGrid,
-				       GridRedBlackCartesian &FiveDimRedBlackGrid,
-				       GridCartesian         &FourDimGrid,
-				       GridRedBlackCartesian &FourDimRedBlackGrid,
-				       RealD _M5,const ImplParams &p) :
-  Impl(p),
+               GridCartesian         &FiveDimGrid,
+               GridRedBlackCartesian &FiveDimRedBlackGrid,
+               GridCartesian         &FourDimGrid,
+               GridRedBlackCartesian &FourDimRedBlackGrid,
+               RealD _M5,const ImplParams &p) :
+  Kernels(p),
   _FiveDimGrid        (&FiveDimGrid),
   _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
   _FourDimGrid        (&FourDimGrid),
@@ -103,8 +104,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
     assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
 
     for(int d=0;d<4;d++){
-      assert(FourDimGrid._simd_layout[d]==1);
-      assert(FourDimRedBlackGrid._simd_layout[d]==1);
+      assert(FourDimGrid._simd_layout[d]==1);         // must compare: '=' would overwrite the layout and always pass
+      assert(FourDimRedBlackGrid._simd_layout[d]==1); // must compare: '=' would overwrite the layout and always pass
       assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
     }
 
@@ -126,10 +127,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 
   vol4=FourDimRedBlackGrid.oSites();
   StencilEven.BuildSurfaceList(LLs,vol4);
-  StencilOdd.BuildSurfaceList(LLs,vol4);
+   StencilOdd.BuildSurfaceList(LLs,vol4);
 
-  //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-  //                       <<" " << StencilEven.surface_list.size()<<std::endl;
+   //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
+   //                       <<" " << StencilEven.surface_list.size()<<std::endl;
 
 }
      
@@ -164,7 +165,7 @@ void WilsonFermion5D<Impl>::Report(void)
     std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
 
-  }
+   }
 
   if ( DerivCalls > 0 ) {
     std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
@@ -244,25 +245,27 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
   assert(dirdisp<=7);
   assert(dirdisp>=0);
 
-  auto Umu_v = Umu.View();
-  auto in_v  = in.View();
-  auto out_v = out.View();
-  thread_loop( (int ss=0;ss<Umu.Grid()->oSites();ss++),{
+   auto Umu_v = Umu.View();
+   auto in_v  = in.View();
+   auto out_v = out.View();
+   thread_loop( (int ss=0;ss<Umu.Grid()->oSites();ss++),{
+       //  parallel_for(int ss=0;ss<Umu.Grid()->oSites();ss++){
     for(int s=0;s<Ls;s++){
       int sU=ss;
       int sF = s+Ls*sU; 
       Kernels::DhopDirK(Stencil,Umu_v,Stencil.CommBuf(),sF,sU,in_v,out_v,dirdisp,gamma);
+      //      Kernels::DhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
     }
   });
 };
 
 template<class Impl>
 void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
-					  DoubledGaugeField & U,
-					  GaugeField &mat,
-					  const FermionField &A,
-					  const FermionField &B,
-					  int dag)
+            DoubledGaugeField & U,
+            GaugeField &mat,
+            const FermionField &A,
+            const FermionField &B,
+            int dag)
 {
   DerivCalls++;
   assert((dag==DaggerNo) ||(dag==DaggerYes));
@@ -282,6 +285,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
   Atilde=A;
   int LLs = B.Grid()->_rdimensions[0];
 
+
   DerivComputeTime-=usecond();
   for (int mu = 0; mu < Nd; mu++) {
     ////////////////////////////////////////////////////////////////////////
@@ -295,12 +299,14 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
     ////////////////////////
 
     DerivDhopComputeTime -= usecond();
+
     auto U_v = U.View();
     auto Btilde_v = Btilde.View();
     auto B_v = B.View();
     int Bsites = B.Grid()->oSites();
     int Usites = U.Grid()->oSites();
     thread_loop( (int sss = 0; sss < U.Grid()->oSites(); sss++) ,{
+	//    parallel_for (int sss = 0; sss < U.Grid()->oSites(); sss++) {
       for (int s = 0; s < Ls; s++) {
         int sU = sss;
         int sF = s + Ls * sU;
@@ -308,8 +314,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
         assert(sF < Bsites);
         assert(sU < Usites);
 
-        Kernels::DhopDirK(st, U_v, st.CommBuf(), sF, sU, B_v, Btilde_v, mu, gamma);
-
+	Kernels::DhopDirK(st, U_v, st.CommBuf(), sF, sU, B_v, Btilde_v, mu, gamma);
+	//        Kernels::DhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
         ////////////////////////////
         // spin trace outer product
         ////////////////////////////
@@ -319,7 +325,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
     // spin trace outer product
     ////////////////////////////
     DerivDhopComputeTime += usecond();
-    this->Impl::InsertForce5D(mat, Btilde, Atilde, mu);
+    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
   }
   DerivComputeTime += usecond();
 }
@@ -336,6 +342,7 @@ void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
   //conformable(GaugeGrid(),mat.Grid());// this is not general! leaving as a comment
 
   mat.Checkerboard() = A.Checkerboard();
+  //  mat.checkerboard = A.checkerboard;
 
   DerivInternal(Stencil,Umu,mat,A,B,dag);
 }
@@ -416,6 +423,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
   auto U_v   = U.View();
   auto in_v  = in.View();
   auto out_v = out.View();
+  int Opt = WilsonKernelsStatic::Opt;
 #pragma omp parallel reduction(max:ctime) reduction(max:ptime)
   { 
     int tid = omp_get_thread_num();
@@ -440,18 +448,19 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
       }
       
       // do the compute
-      int Opt = WilsonKernelsStatic::Opt;
       if (dag == DaggerYes) {
 	for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  int sU = ss;
 	  int sF = LLs * sU;
-	  Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
+ 	  Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
+	  //	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
 	}
       } else {
 	for (int ss = myblock; ss < myblock+myn; ++ss) {
 	  int sU = ss;
 	  int sF = LLs * sU;
 	  Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
+	  //	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
 	}
       }
       ptime = usecond() - start;
@@ -473,20 +482,22 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
   DhopFaceTime+=usecond();
 
   DhopComputeTime2-=usecond();
-  int Opt = WilsonKernelsStatic::Opt;
   if (dag == DaggerYes) {
     int sz=st.surface_list.size();
-    int Opt = WilsonKernelsStatic::Opt;
     thread_loop( (int ss = 0; ss < sz; ss++) ,{
+	//    parallel_for (int ss = 0; ss < sz; ss++) {
       int sU = st.surface_list[ss];
       int sF = LLs * sU;
       Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
+      //      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
     });
   } else {
     int sz=st.surface_list.size();
     thread_loop( (int ss = 0; ss < sz; ss++) ,{
+	//    parallel_for (int ss = 0; ss < sz; ss++) {
       int sU = st.surface_list[ss];
       int sF = LLs * sU;
+      //      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
       Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
     });
   }
@@ -499,8 +510,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
-						    DoubledGaugeField & U,
-						    const FermionField &in, FermionField &out,int dag)
+					 DoubledGaugeField & U,
+					 const FermionField &in, FermionField &out,int dag)
 {
   //  assert((dag==DaggerNo) ||(dag==DaggerYes));
   Compressor compressor(dag);
@@ -513,12 +524,23 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
   
   DhopComputeTime-=usecond();
   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+
   auto U_v = U.View();
   int Opt = WilsonKernelsStatic::Opt;
   if (dag == DaggerYes) {
     Kernels::DhopDag(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
+    //    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
+    //      int sU = ss;
+    //      int sF = LLs * sU;
+    //      Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
+    //    }
   } else {
     Kernels::Dhop(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
+    //    parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
+    //      int sU = ss;
+    //      int sF = LLs * sU;
+    //      Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
+    //    }
   }
   DhopComputeTime+=usecond();
 }
@@ -568,7 +590,221 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 }
 
 template<class Impl>
-void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const FermionField &in, RealD mass) 
+void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)
+{
+  // Fills the 5d field 'out' slice by slice (index 0 = s) from the 5d field 'in'; twist[mu] shifts the momentum k_mu.
+  GridBase *_grid = _FourDimGrid;
+  GridBase *_5dgrid = _FiveDimGrid;
+
+  conformable(_5dgrid,out.Grid());
+  // Work fields: 5d sources/results plus 4d per-slice buffers.
+  FermionField   PRsource(_5dgrid);
+  FermionField   PLsource(_5dgrid);
+  FermionField   buf1_4d(_grid);
+  FermionField   buf2_4d(_grid);
+  FermionField   GR(_5dgrid);
+  FermionField   GL(_5dgrid);
+  FermionField   bufL_4d(_grid);
+  FermionField   bufR_4d(_grid);
+
+  unsigned int Ls = in.Grid()->_rdimensions[0];
+  assert(twist.size() >= (size_t)Nd); // twist[mu] is read for every mu < Nd below
+  typedef typename FermionField::vector_type vector_type;
+  typedef typename FermionField::scalar_type ScalComplex;
+  typedef iSinglet<ScalComplex> Tcomplex;
+  typedef Lattice<iSinglet<vector_type> > LatComplex;
+
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+  Gamma g5(Gamma::Algebra::Gamma5);
+
+  Coordinate latt_size   = _grid->_fdimensions;
+  // Scalar (complex) lattices on the 4d grid used to build the coefficients.
+  LatComplex    sk(_grid);  sk = Zero();
+  LatComplex    sk2(_grid); sk2= Zero();
+  LatComplex    W(_grid); W= Zero();
+  LatComplex    a(_grid); a= Zero();
+  LatComplex    one  (_grid); one = ScalComplex(1.0,0.0);
+  LatComplex 	cosha(_grid);
+  LatComplex 	kmu(_grid);
+  LatComplex 	Wea(_grid);
+  LatComplex 	Wema(_grid);
+  LatComplex 	sinha(_grid);
+  LatComplex 	sinhaLs(_grid);
+  LatComplex 	coshaLs(_grid);
+  LatComplex 	A(_grid);
+  LatComplex 	F(_grid);
+  LatComplex 	App(_grid);
+  LatComplex 	Amm(_grid);
+  LatComplex 	Bpp(_grid);
+  LatComplex 	Bmm(_grid);
+  LatComplex 	ABpm(_grid); //Apm=Amp=Bpm=Bmp
+  LatComplex 	signW(_grid);
+
+  ScalComplex ci(0.0,1.0);
+  // Accumulate sk = sum_mu sin^2(k_mu) and sk2 = sum_mu 2*sin^2(k_mu/2), with k_mu shifted by the twist.
+  for(int mu=0;mu<Nd;mu++) {
+
+    LatticeCoordinate(kmu,mu);
+
+    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+
+    kmu = TwoPiL * kmu;
+    kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
+
+    sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
+    sk  = sk  +     sin(kmu)    *sin(kmu);
+  }
+
+  W = one - M5 + sk2;
+
+  ////////////////////////////////////////////
+  // Cosh alpha -> alpha
+  ////////////////////////////////////////////
+  cosha = (one + W*W + sk) / (abs(W)*2.0);
+
+  // FIXME Need a Lattice acosh
+  for(int idx=0;idx<_grid->lSites();idx++){
+    Coordinate lcoor(Nd);
+    Tcomplex cc;
+    // cosha must be real and >= 1 at every site for acosh to apply (asserted below)
+    _grid->LocalIndexToLocalCoor(idx,lcoor);
+    peekLocalSite(cc,cosha,lcoor);
+    assert((double)real(cc)>=1.0);
+    assert(fabs((double)imag(cc))<=1.0e-15);
+    cc = ScalComplex(::acosh(real(cc)),0.0);
+    pokeLocalSite(cc,a,lcoor);
+  }
+
+  Wea = ( exp( a) * abs(W)  );
+  Wema= ( exp(-a) * abs(W)  );
+  sinha = 0.5*(exp( a) - exp(-a));
+  sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls));
+  coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls));
+
+  A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
+  F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass);
+  F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass);
+  F = F - abs(W) * sinha * 4.0 * mass;
+
+  Bpp =  (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one);
+  Bmm =  (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one);
+  App =  (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * (exp(-a) - abs(W)) * (one - mass*mass * one);
+  Amm =  (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one);
+  ABpm = (A/F) * abs(W) * sinha * 2.0  * (one + mass * coshaLs * 2.0 + mass*mass * one);
+
+  //P+ source, P- source
+  PRsource = (in + g5 * in) * 0.5;
+  PLsource = (in - g5 * in) * 0.5;
+
+  //calculate GR, GL
+  for(unsigned int ss=1;ss<=Ls;ss++) // ss, tt are 1-based; slices are stored at index ss-1 / tt-1
+  {
+    bufR_4d = Zero();
+    bufL_4d = Zero();
+    for(unsigned int tt=1;tt<=Ls;tt++)
+    {
+      //possible sign if W<0
+      if((ss+tt)%2==1) signW = abs(W)/W;
+      else signW = one;
+
+      unsigned int f = (ss > tt) ? ss-tt : tt-ss; //f = abs(ss-tt)
+      //GR
+      buf1_4d = Zero();
+      ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
+      //G(s,t)
+      bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
+      //A++*exp(a(s+t))
+      bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ;
+      //A+-*exp(a(s-t))
+      bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ;
+      //A-+*exp(a(-s+t))
+      bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ;
+      //A--*exp(a(-s-t))
+      bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;
+
+      //GL
+      buf2_4d = Zero();
+      ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
+      //G(s,t)
+      bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
+      //B++*exp(a(s+t))
+      bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ;
+      //B+-*exp(a(s-t))
+      bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ;
+      //B-+*exp(a(-s+t))
+      bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ;
+      //B--*exp(a(-s-t))
+      bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ;
+    }
+    InsertSlice(bufR_4d, GR, (ss-1), 0);
+    InsertSlice(bufL_4d, GL, (ss-1), 0);
+  }
+
+//calculate propagator
+  for(unsigned int ss=1;ss<=Ls;ss++)
+  {
+    bufR_4d = Zero();
+    bufL_4d = Zero();
+
+    //(i*gamma_mu*sin(p_mu) - W)*(GL*P- source)
+    buf1_4d = Zero();
+    ExtractSlice(buf1_4d, GL, (ss-1), 0);
+    buf2_4d = Zero();
+    for(int mu=0;mu<Nd;mu++) {
+      LatticeCoordinate(kmu,mu);
+      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      kmu = TwoPiL * kmu + TwoPiL * one * twist[mu];//twisted boundary
+      buf2_4d = buf2_4d + sin(kmu)*ci*(Gamma(Gmu[mu])*buf1_4d);
+    }
+    bufL_4d = buf2_4d - W * buf1_4d;
+
+    //(i*gamma_mu*sin(p_mu) - W)*(GR*P+ source)
+    buf1_4d = Zero();
+    ExtractSlice(buf1_4d, GR, (ss-1), 0);
+    buf2_4d = Zero();
+    for(int mu=0;mu<Nd;mu++) {
+      LatticeCoordinate(kmu,mu);
+      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      kmu = TwoPiL * kmu + TwoPiL * one * twist[mu];//twisted boundary
+      buf2_4d = buf2_4d + sin(kmu)*ci*(Gamma(Gmu[mu])*buf1_4d);
+    }
+    bufR_4d = buf2_4d - W * buf1_4d;
+
+    //(delta(s-1,u) - m*delta(s,1)*delta(u,Ls))*GL
+    if(ss==1){
+      ExtractSlice(buf1_4d, GL, (Ls-1), 0);
+      bufL_4d = bufL_4d - mass*buf1_4d;
+    }
+    else {
+      ExtractSlice(buf1_4d, GL, (ss-2), 0);
+      bufL_4d = bufL_4d + buf1_4d;
+    }
+
+    //(delta(s+1,u) - m*delta(s,Ls)*delta(u,1))*GR
+    if(ss==Ls){
+      ExtractSlice(buf1_4d, GR, 0, 0);
+      bufR_4d = bufR_4d - mass*buf1_4d;
+    }
+    else {
+      ExtractSlice(buf1_4d, GR, ss, 0);
+      bufR_4d = bufR_4d + buf1_4d;
+    }
+    buf1_4d = bufL_4d + bufR_4d;
+    InsertSlice(buf1_4d, out, (ss-1), 0);
+  }
+
+
+  out = out * (-1.0);
+}
+
+template<class Impl>
+void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)
 {
   // what type LatticeComplex 
   GridBase *_grid = _FourDimGrid;
@@ -586,7 +822,8 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
     Gamma::Algebra::GammaT
   };
 
-  Coordinate latt_size   = _grid->FullDimensions();
+  Coordinate latt_size   = _grid->_fdimensions;
+
   
   FermionField   num  (_grid); num  = Zero();
 
@@ -610,6 +847,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
     RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
     
     kmu = TwoPiL * kmu;
+    kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
     
     sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
     sk  = sk  +     sin(kmu)    *sin(kmu); 
@@ -623,12 +861,13 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
   ////////////////////////////////////////////
   // Cosh alpha -> alpha
   ////////////////////////////////////////////
-  cosha =  (one + W*W + sk) / (W*2.0);
+  cosha =  (one + W*W + sk) / (abs(W)*2.0);
 
   // FIXME Need a Lattice acosh
   for(int idx=0;idx<_grid->lSites();idx++){
     Coordinate lcoor(Nd);
     Tcomplex cc;
+    RealD sgn;
     _grid->LocalIndexToLocalCoor(idx,lcoor);
     peekLocalSite(cc,cosha,lcoor);
     assert((double)real(cc)>=1.0);
@@ -637,8 +876,8 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
     pokeLocalSite(cc,a,lcoor);
   }
   
-  Wea = ( exp( a) * W  ); 
-  Wema= ( exp(-a) * W  ); 
+  Wea = ( exp( a) * abs(W)  );
+  Wema= ( exp(-a) * abs(W)  );
   
   num   = num + ( one - Wema ) * mass * in;
   denom= ( Wea - one ) + mass*mass * (one - Wema); 
@@ -646,63 +885,64 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
 }
 
 template<class Impl>
-void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass) 
+void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist)
 {
-  Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT
-  };
+    Gamma::Algebra Gmu [] = { // gamma matrix used for each direction mu = 0..3
+      Gamma::Algebra::GammaX,
+      Gamma::Algebra::GammaY,
+      Gamma::Algebra::GammaZ,
+      Gamma::Algebra::GammaT
+    };
 
-  GridBase *_grid = _FourDimGrid;
-  conformable(_grid,out.Grid());
+    GridBase *_grid = _FourDimGrid; // all temporaries below live on the 4d grid
+    conformable(_grid,out.Grid());
 
-  typedef typename FermionField::vector_type vector_type;
-  typedef typename FermionField::scalar_type ScalComplex;
+    typedef typename FermionField::vector_type vector_type;
+    typedef typename FermionField::scalar_type ScalComplex;
 
-  typedef Lattice<iSinglet<vector_type> > LatComplex;
+    typedef Lattice<iSinglet<vector_type> > LatComplex;
 
 
-  Coordinate latt_size   = _grid->FullDimensions();
+    Coordinate latt_size   = _grid->_fdimensions;
 
-  LatComplex    sk(_grid);  sk = Zero();
-  LatComplex    sk2(_grid); sk2= Zero();
+    LatComplex    sk(_grid);  sk = Zero(); // accumulates sum_mu sin^2(k_mu)
+    LatComplex    sk2(_grid); sk2= Zero(); // accumulates sum_mu 2*sin^2(k_mu/2)
 
-  LatComplex    w_k(_grid); w_k= Zero();
-  LatComplex    b_k(_grid); b_k= Zero();
+    LatComplex    w_k(_grid); w_k= Zero();
+    LatComplex    b_k(_grid); b_k= Zero();
 
-  LatComplex     one  (_grid); one = ScalComplex(1.0,0.0);
+    LatComplex     one  (_grid); one = ScalComplex(1.0,0.0);
 
-  FermionField   num  (_grid); num  = Zero();
-  LatComplex denom(_grid); denom= Zero();
-  LatComplex kmu(_grid); 
-  ScalComplex ci(0.0,1.0);
+    FermionField   num  (_grid); num  = Zero();
+    LatComplex denom(_grid); denom= Zero();
+    LatComplex kmu(_grid); 
+    ScalComplex ci(0.0,1.0); // imaginary unit
 
-  for(int mu=0;mu<Nd;mu++) {
+    for(int mu=0;mu<Nd;mu++) { // reads twist[mu] for every mu < Nd
 
-    LatticeCoordinate(kmu,mu);
+      LatticeCoordinate(kmu,mu);
 
-    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu]; // 2*pi / L_mu
 
-    kmu = TwoPiL * kmu;
+      kmu = TwoPiL * kmu;
+      kmu = kmu + TwoPiL * one * twist[mu]; // shift k_mu by 2*pi*twist[mu]/L_mu (twisted boundary conditions)
 
-    sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
-    sk  = sk  + sin(kmu)*sin(kmu); 
+      sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
+      sk  = sk  + sin(kmu)*sin(kmu); 
 
-    num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);
+      num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in); // num -= i*sin(k_mu)*gamma_mu*in
 
-  }
-  num = num + mass * in ;
+    }
+    num = num + mass * in ;
 
-  b_k = sk2 - M5;
+    b_k = sk2 - M5;
      
-  w_k = sqrt(sk + b_k*b_k);
+    w_k = sqrt(sk + b_k*b_k);
 
-  denom= ( w_k + b_k + mass*mass) ;
+    denom= ( w_k + b_k + mass*mass) ;
 
-  denom= one/denom;
-  out = num*denom;
+    denom= one/denom; // invert once so the final step is a multiply
+    out = num*denom; // out = (mass*in - i*sum_mu sin(k_mu)*gamma_mu*in) / (w_k + b_k + mass^2)
 
 }
 
@@ -713,18 +953,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
  ******************************************************************************/
 
 // Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
-#define REVERSE_LS(qSite, qSiteRev, Nsimd)				\
-  {									\
-    ExtractBuffer<typename SitePropagator::scalar_object> qSiteVec(Nsimd); \
-    extract(qSite, qSiteVec);						\
-    for (int i = 0; i < Nsimd / 2; ++i)					\
-      {									\
-        typename SitePropagator::scalar_object tmp = qSiteVec[i];	\
-        qSiteVec[i] = qSiteVec[Nsimd - i - 1];				\
-        qSiteVec[Nsimd - i - 1] = tmp;					\
-      }									\
-    merge(qSiteRev, qSiteVec);						\
-  }
+#define REVERSE_LS(qSite, qSiteRev, Nsimd) /* qSiteRev <- qSite with its Nsimd extracted entries in reverse order */ \
+{ \
+    ExtractBuffer<typename SitePropagator::scalar_object> qSiteVec(Nsimd); \
+    extract(qSite, qSiteVec); \
+    for (int i = 0; i < (Nsimd) / 2; ++i) /* (Nsimd) parenthesised so expression arguments keep their value */ \
+    { \
+        typename SitePropagator::scalar_object tmp = qSiteVec[i]; /* swap entry i with its mirror Nsimd-1-i */ \
+        qSiteVec[i] = qSiteVec[(Nsimd) - i - 1]; \
+        qSiteVec[(Nsimd) - i - 1] = tmp; \
+    } \
+    merge(qSiteRev, qSiteVec); \
+}
 
 template <class Impl>
 void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
@@ -733,142 +973,140 @@ void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                      Current curr_type,
                                                      unsigned int mu)
 {
-  conformable(q_in_1.Grid(), FermionGrid());
-  conformable(q_in_1.Grid(), q_in_2.Grid());
-  conformable(_FourDimGrid, q_out.Grid());
-  PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
-  unsigned int LLs = q_in_1.Grid()->_rdimensions[0];
-  q_out = Zero();
+    conformable(q_in_1.Grid(), FermionGrid());
+    conformable(q_in_1.Grid(), q_in_2.Grid());
+    conformable(_FourDimGrid, q_out.Grid());
+    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
+    unsigned int LLs = q_in_1.Grid()->_rdimensions[0];
+    q_out = Zero();
 
-  auto q_in_1_v = q_in_1.View();
-  auto q_in_2_v = q_in_2.View();
-  auto tmp1_v   = tmp1.View();
-  auto tmp2_v   = tmp2.View();
-  auto q_out_v  = q_out.View();
-  auto Umu_v    = Umu.View();
-  // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
-  // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
-  tmp1 = Cshift(q_in_1, mu + 1, 1);
-  tmp2 = Cshift(q_in_2, mu + 1, 1);
+    // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
+    // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
+    tmp1 = Cshift(q_in_1, mu + 1, 1);
+    tmp2 = Cshift(q_in_2, mu + 1, 1);
+    auto q_in_1_v = q_in_1.View();
+    auto q_in_2_v = q_in_2.View();
+    auto tmp1_v   = tmp1.View();
+    auto tmp2_v   = tmp2.View();
+    auto q_out_v  = q_out.View();
+    auto Umu_v    = Umu.View();
+    thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) , {
 
-  thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
-    unsigned int sF1 = sU * LLs;
-    unsigned int sF2 = (sU + 1) * LLs - 1;
-    
-    for (unsigned int s = 0; s < LLs; ++s) {
+        unsigned int sF1 = sU * LLs;
+        unsigned int sF2 = (sU + 1) * LLs - 1;
 
-      bool axial_sign = ((curr_type == Current::Axial) &&	\
-			 (s < (LLs / 2)));
-      SitePropagator qSite2, qmuSite2;
-      
-      // If vectorised in 5th dimension, reverse q2 vector to match up
-      // sites correctly.
-      if (Impl::LsVectorised) {
-	REVERSE_LS(q_in_2_v[sF2], qSite2, Ls / LLs);
-	REVERSE_LS(tmp2_v[sF2], qmuSite2, Ls / LLs);
-      } else {
-	qSite2   = q_in_2_v[sF2];
-	qmuSite2 = tmp2_v[sF2];
-      }
-      Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sF1], 
-					       qSite2, 
-					       q_out_v[sU],
-					       Umu_v, sU, mu, axial_sign);
-      Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sF1],
-					       qmuSite2,
-					       q_out_v[sU],
-					       Umu_v, sU, mu, axial_sign);
-      sF1++;
-      sF2--;
-    }
-  });
+        for (unsigned int s = 0; s < LLs; ++s)
+        {
+            bool axial_sign = ((curr_type == Current::Axial) && \
+                               (s < (LLs / 2)));
+            SitePropagator qSite2, qmuSite2;
+
+            // If vectorised in 5th dimension, reverse q2 vector to match up
+            // sites correctly.
+            if (Impl::LsVectorised)
+            {
+                REVERSE_LS(q_in_2_v[sF2], qSite2, Ls / LLs);
+                REVERSE_LS(tmp2_v[sF2], qmuSite2, Ls / LLs);
+            }
+            else
+            {
+                qSite2   = q_in_2_v[sF2];
+                qmuSite2 = tmp2_v[sF2];
+            }
+            Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sF1], 
+                                                     qSite2, 
+                                                     q_out_v[sU],
+                                                     Umu_v, sU, mu, axial_sign);
+            Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sF1],
+                                                     qmuSite2,
+                                                     q_out_v[sU],
+                                                     Umu_v, sU, mu, axial_sign);
+            sF1++;
+            sF2--;
+        }
+    });
 }
 
 
+
 template <class Impl>
 void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
                                                 PropagatorField &q_out,
                                                 Current curr_type, 
                                                 unsigned int mu,
-                                                std::vector<Real> mom,
                                                 unsigned int tmin, 
-                                                unsigned int tmax)
+                                                unsigned int tmax,
+						ComplexField &lattice_cmplx)
 {
-  conformable(q_in.Grid(), FermionGrid());
-  conformable(q_in.Grid(), q_out.Grid());
-  Lattice<iSinglet<Simd>> ph(FermionGrid()), coor(FermionGrid());
-  PropagatorField tmpFwd(FermionGrid()), tmpBwd(FermionGrid()),
-    tmp(FermionGrid());
-  Complex i(0.0, 1.0);
-  unsigned int tshift = (mu == Tp) ? 1 : 0;
-  unsigned int LLs = q_in.Grid()->_rdimensions[0];
-  unsigned int LLt    = GridDefaultLatt()[Tp];
+    conformable(q_in.Grid(), FermionGrid());
+    conformable(q_in.Grid(), q_out.Grid());
+    PropagatorField tmp(GaugeGrid()),tmp2(GaugeGrid());
+    unsigned int tshift = (mu == Tp) ? 1 : 0;
+    unsigned int LLs = q_in.Grid()->_rdimensions[0];
+    unsigned int LLt    = GridDefaultLatt()[Tp];
 
-  // Momentum projection.
-  ph = Zero();
-  for(unsigned int nu = 0; nu < Nd - 1; nu++)
-    {
-      // Shift coordinate lattice index by 1 to account for 5th dimension.
-      LatticeCoordinate(coor, nu + 1);
-      ph = ph + mom[nu]*coor*((1./(_FourDimGrid->_fdimensions[nu])));
-    }
-  ph = exp((Real)(2*M_PI)*i*ph);
-
-  q_out = Zero();
-  LatticeInteger coords(_FourDimGrid);
-  LatticeCoordinate(coords, Tp);
-  auto coords_v = coords.View();
-  // Need q(x + mu, s) and q(x - mu, s). 5D lattice so shift 4D coordinate mu
-  // by one.
-  tmp = Cshift(q_in, mu + 1, 1);
-  tmpFwd = tmp*ph;
-  tmp = ph*q_in;
-  tmpBwd = Cshift(tmp, mu + 1, -1);
-
-  auto tmpBwd_v = tmpBwd.View();
-  auto tmpFwd_v = tmpFwd.View();
-  auto q_out_v  = q_out.View();
-  auto Umu_v    = Umu.View();
-  thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) ,{
-    // Compute the sequential conserved current insertion only if our simd
-    // object contains a timeslice we need.
-    vInteger t_mask   = ((coords_v[sU] >= tmin) &&
-			 (coords_v[sU] <= tmax));
-    Integer timeSlices = Reduce(t_mask);
-      
-    if (timeSlices > 0) {
-
-      unsigned int sF = sU * LLs;
-      for (unsigned int s = 0; s < LLs; ++s) {
-	bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
-	Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sF], 
-					    q_out_v[sF], Umu_v, sU,
-					    mu, t_mask, axial_sign);
-	++sF;
-      }
-    }
-
-    // Repeat for backward direction.
-    t_mask     = ((coords_v[sU] >= (tmin + tshift)) && 
-		  (coords_v[sU] <= (tmax + tshift)));
-
-    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
-    unsigned int t0 = 0;
-    if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords_v[sU] == t0 ));
+    q_out = Zero();
+    LatticeInteger coords(_FourDimGrid);
+    LatticeCoordinate(coords, Tp);
     
-    timeSlices = Reduce(t_mask);
+    auto q_out_v = q_out.View();
+    auto tmp2_v  = tmp2.View();
+    auto coords_v= coords.View();
+    auto Umu_v   = Umu.View();
+    for (unsigned int s = 0; s < LLs; ++s)
+    {
+        bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
+	bool tadpole_sign = (curr_type == Current::Tadpole);
+	bool switch_sgn = tadpole_sign || axial_sign;
 
-    if (timeSlices > 0) {
-      unsigned int sF = sU * LLs;
-      for (unsigned int s = 0; s < LLs; ++s) {
-	bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
-	Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sF], 
-					    q_out_v[sF], Umu_v, sU,
-					    mu, t_mask, axial_sign);
-	++sF;
-      }
+
+        //forward direction: Need q(x + mu, s)*A(x)
+        ExtractSlice(tmp2, q_in, s, 0);  //q(x,s) 
+        tmp = Cshift(tmp2, mu, 1);	 //q(x+mu,s)
+        tmp2 = tmp*lattice_cmplx;	 //q(x+mu,s)*A(x)	
+
+    	thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
+            // Compute the sequential conserved current insertion only if our simd
+            // object contains a timeslice we need.
+            vInteger t_mask   = ((coords_v[sU] >= tmin) &&
+                	         (coords_v[sU] <= tmax));
+            Integer timeSlices = Reduce(t_mask);
+
+            if (timeSlices > 0)
+            {
+		unsigned int sF = sU * LLs + s;
+                Kernels::SeqConservedCurrentSiteFwd(tmp2_v[sU], 
+                                              q_out_v[sF], Umu_v, sU,
+                                              mu, t_mask, switch_sgn);
+            }
+
+        });
+
+        //backward direction: Need q(x - mu, s)*A(x-mu)
+        ExtractSlice(tmp2, q_in, s, 0);  //q(x,s)
+        tmp = lattice_cmplx*tmp2;	 //q(x,s)*A(x)
+        tmp2 = Cshift(tmp, mu, -1);	 //q(x-mu,s)*A(x-mu)
+
+    	thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU),
+    	{
+            vInteger  t_mask     = ((coords_v[sU] >= (tmin + tshift)) && 
+                   	  	    (coords_v[sU] <= (tmax + tshift)));
+
+	    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
+	    unsigned int t0 = 0;
+	    if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords_v[sU] == t0 ));
+
+            Integer timeSlices = Reduce(t_mask);
+
+            if (timeSlices > 0)
+            {
+		unsigned int sF = sU * LLs + s; 
+        	Kernels::SeqConservedCurrentSiteBwd(tmp2_v[sU], 
+                                             q_out_v[sF], Umu_v, sU,
+                                             mu, t_mask, axial_sign);
+            }
+	});
     }
-  });
 }
 
 FermOpTemplateInstantiate(WilsonFermion5D);
@@ -878,3 +1116,4 @@ NAMESPACE_END(Grid);
 
 
 
+
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h
similarity index 93%
rename from lib/qcd/action/fermion/WilsonFermion5D.h
rename to Grid/qcd/action/fermion/WilsonFermion5D.h
index c7c907ed..bd97ea86 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -66,7 +66,7 @@ public:
 };
 
 template<class Impl>
-class WilsonFermion5D : public WilsonFermion5DStatic, public Impl
+class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
 {
 public:
   INHERIT_IMPL_TYPES(Impl);
@@ -117,8 +117,9 @@ public:
   virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
   virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
 
-  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass) ;
-  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass) ;
+      void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+      void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+      void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
 
   // Implement hopping term non-hermitian hopping term; half cb or both
   // Implement s-diagonal DW
@@ -225,9 +226,9 @@ public:
 			   PropagatorField &q_out,
 			   Current curr_type,
 			   unsigned int mu,
-			   std::vector<Real> mom,
 			   unsigned int tmin,
-			   unsigned int tmax);
+                             unsigned int tmax,
+			     ComplexField &lattice_cmplx);
 };
 
 NAMESPACE_END(Grid);
diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/Grid/qcd/action/fermion/WilsonKernels.cc
similarity index 100%
rename from lib/qcd/action/fermion/WilsonKernels.cc
rename to Grid/qcd/action/fermion/WilsonKernels.cc
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h
similarity index 97%
rename from lib/qcd/action/fermion/WilsonKernels.h
rename to Grid/qcd/action/fermion/WilsonKernels.h
index a0922934..c12b8c2f 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -28,8 +28,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#ifndef GRID_QCD_DHOP_H
-#define GRID_QCD_DHOP_H
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -45,10 +44,11 @@ public:
   static int Comms;
 };
  
-template<class Impl> class WilsonKernels { 
+template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
 public:
 
   INHERIT_IMPL_TYPES(Impl);
+  typedef FermionOperator<Impl> Base;
    
 public:
 
@@ -58,7 +58,6 @@ public:
   static void DhopDag(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 		      int Ls, int Nsite, const FermionField &in, FermionField &out,
 		      int interior=1,int exterior=1) ;
-
    
   template <bool EnableBool = true> static accelerator
   typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
@@ -110,7 +109,8 @@ public:
   template <bool EnableBool = true> static accelerator
   typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
   DhopSite(int Opt, StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-	   int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out,int interior=1,int exterior=1 ) {
+	   int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out,int interior=1,int exterior=1 ) 
+  {
     // no kernel choice  
     for (int site = 0; site < Nsite; site++) {
       for (int s = 0; s < Ls; s++) {
@@ -173,8 +173,8 @@ public:
   template <bool EnableBool = true> static accelerator
   typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
   DhopSiteDag(int Opt,StencilView &st,  DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
-	      int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out,int interior=1,int exterior=1) {
-
+	      int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out,int interior=1,int exterior=1) 
+  {
     for (int site = 0; site < Nsite; site++) {
       for (int s = 0; s < Ls; s++) {
 	if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,U,buf,sF,sU,in,out);
@@ -288,8 +288,10 @@ private:
   static accelerator void HandDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
 					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
 
+ public:
+ WilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
 };
     
 NAMESPACE_END(Grid);
 
-#endif
+
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
similarity index 100%
rename from lib/qcd/action/fermion/WilsonKernelsAsm.cc
rename to Grid/qcd/action/fermion/WilsonKernelsAsm.cc
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h b/Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
similarity index 100%
rename from lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
rename to Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h
similarity index 100%
rename from lib/qcd/action/fermion/WilsonKernelsAsmBody.h
rename to Grid/qcd/action/fermion/WilsonKernelsAsmBody.h
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
similarity index 100%
rename from lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
rename to Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc b/Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
similarity index 100%
rename from lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
rename to Grid/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmQPX.h b/Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h
similarity index 100%
rename from lib/qcd/action/fermion/WilsonKernelsAsmQPX.h
rename to Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h
diff --git a/lib/qcd/action/fermion/WilsonKernelsGpu.cc b/Grid/qcd/action/fermion/WilsonKernelsGpu.cc
similarity index 99%
rename from lib/qcd/action/fermion/WilsonKernelsGpu.cc
rename to Grid/qcd/action/fermion/WilsonKernelsGpu.cc
index 6f06af5b..5d849401 100644
--- a/lib/qcd/action/fermion/WilsonKernelsGpu.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsGpu.cc
@@ -59,13 +59,13 @@ accelerator_inline int get_my_lane_offset(int Nsimd)
 
 accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
 {
-#if 0
-  chip = *mem;
-#else 
-  assert(sizeof(StencilEntry)==sizeof(uint4));
+#ifdef __CUDA_ARCH__
+  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
   uint4 * mem_pun  = (uint4 *)mem;
   uint4 * chip_pun = (uint4 *)&chip;
   * chip_pun = * mem_pun;
+#else 
+  chip = *mem;
 #endif
   return;
 }
diff --git a/Grid/qcd/action/fermion/WilsonKernelsHand.cc b/Grid/qcd/action/fermion/WilsonKernelsHand.cc
new file mode 100644
index 00000000..2046dc05
--- /dev/null
+++ b/Grid/qcd/action/fermion/WilsonKernelsHand.cc
@@ -0,0 +1,630 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/qcd/action/fermion/WilsonKernelsHand.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+#define REGISTER
+
+#define LOAD_CHIMU \
+  {const SiteSpinor & ref (in[offset]);	\
+    Chimu_00=ref()(0)(0);\
+    Chimu_01=ref()(0)(1);\
+    Chimu_02=ref()(0)(2);\
+    Chimu_10=ref()(1)(0);\
+    Chimu_11=ref()(1)(1);\
+    Chimu_12=ref()(1)(2);\
+    Chimu_20=ref()(2)(0);\
+    Chimu_21=ref()(2)(1);\
+    Chimu_22=ref()(2)(2);\
+    Chimu_30=ref()(3)(0);\
+    Chimu_31=ref()(3)(1);\
+    Chimu_32=ref()(3)(2);}
+
+#define LOAD_CHI\
+  {const SiteHalfSpinor &ref(buf[offset]);	\
+    Chi_00 = ref()(0)(0);\
+    Chi_01 = ref()(0)(1);\
+    Chi_02 = ref()(0)(2);\
+    Chi_10 = ref()(1)(0);\
+    Chi_11 = ref()(1)(1);\
+    Chi_12 = ref()(1)(2);}
+
+// To splat or not to splat depends on the implementation
+#define MULT_2SPIN(A)\
+  {auto & ref(U[sU](A));			\
+   Impl::loadLinkElement(U_00,ref()(0,0));	\
+   Impl::loadLinkElement(U_10,ref()(1,0));	\
+   Impl::loadLinkElement(U_20,ref()(2,0));	\
+   Impl::loadLinkElement(U_01,ref()(0,1));	\
+   Impl::loadLinkElement(U_11,ref()(1,1));	\
+   Impl::loadLinkElement(U_21,ref()(2,1));	\
+    UChi_00 = U_00*Chi_00;\
+    UChi_10 = U_00*Chi_10;\
+    UChi_01 = U_10*Chi_00;\
+    UChi_11 = U_10*Chi_10;\
+    UChi_02 = U_20*Chi_00;\
+    UChi_12 = U_20*Chi_10;\
+    UChi_00+= U_01*Chi_01;\
+    UChi_10+= U_01*Chi_11;\
+    UChi_01+= U_11*Chi_01;\
+    UChi_11+= U_11*Chi_11;\
+    UChi_02+= U_21*Chi_01;\
+    UChi_12+= U_21*Chi_11;\
+    Impl::loadLinkElement(U_00,ref()(0,2));	\
+    Impl::loadLinkElement(U_10,ref()(1,2));	\
+    Impl::loadLinkElement(U_20,ref()(2,2));	\
+    UChi_00+= U_00*Chi_02;\
+    UChi_10+= U_00*Chi_12;\
+    UChi_01+= U_10*Chi_02;\
+    UChi_11+= U_10*Chi_12;\
+    UChi_02+= U_20*Chi_02;\
+    UChi_12+= U_20*Chi_12;}
+
+
+#define PERMUTE_DIR(dir)			\
+      permute##dir(Chi_00,Chi_00);\
+      permute##dir(Chi_01,Chi_01);\
+      permute##dir(Chi_02,Chi_02);\
+      permute##dir(Chi_10,Chi_10);\
+      permute##dir(Chi_11,Chi_11);\
+      permute##dir(Chi_12,Chi_12);
+
+//      hspin(0)=fspin(0)+timesI(fspin(3));
+//      hspin(1)=fspin(1)+timesI(fspin(2));
+#define XP_PROJ \
+    Chi_00 = Chimu_00+timesI(Chimu_30);\
+    Chi_01 = Chimu_01+timesI(Chimu_31);\
+    Chi_02 = Chimu_02+timesI(Chimu_32);\
+    Chi_10 = Chimu_10+timesI(Chimu_20);\
+    Chi_11 = Chimu_11+timesI(Chimu_21);\
+    Chi_12 = Chimu_12+timesI(Chimu_22);
+
+#define YP_PROJ \
+    Chi_00 = Chimu_00-Chimu_30;\
+    Chi_01 = Chimu_01-Chimu_31;\
+    Chi_02 = Chimu_02-Chimu_32;\
+    Chi_10 = Chimu_10+Chimu_20;\
+    Chi_11 = Chimu_11+Chimu_21;\
+    Chi_12 = Chimu_12+Chimu_22;
+
+#define ZP_PROJ \
+  Chi_00 = Chimu_00+timesI(Chimu_20);		\
+  Chi_01 = Chimu_01+timesI(Chimu_21);		\
+  Chi_02 = Chimu_02+timesI(Chimu_22);		\
+  Chi_10 = Chimu_10-timesI(Chimu_30);		\
+  Chi_11 = Chimu_11-timesI(Chimu_31);		\
+  Chi_12 = Chimu_12-timesI(Chimu_32);
+
+#define TP_PROJ \
+  Chi_00 = Chimu_00+Chimu_20;		\
+  Chi_01 = Chimu_01+Chimu_21;		\
+  Chi_02 = Chimu_02+Chimu_22;		\
+  Chi_10 = Chimu_10+Chimu_30;		\
+  Chi_11 = Chimu_11+Chimu_31;		\
+  Chi_12 = Chimu_12+Chimu_32;
+
+
+//      hspin(0)=fspin(0)-timesI(fspin(3));
+//      hspin(1)=fspin(1)-timesI(fspin(2));
+#define XM_PROJ \
+    Chi_00 = Chimu_00-timesI(Chimu_30);\
+    Chi_01 = Chimu_01-timesI(Chimu_31);\
+    Chi_02 = Chimu_02-timesI(Chimu_32);\
+    Chi_10 = Chimu_10-timesI(Chimu_20);\
+    Chi_11 = Chimu_11-timesI(Chimu_21);\
+    Chi_12 = Chimu_12-timesI(Chimu_22);
+
+#define YM_PROJ \
+    Chi_00 = Chimu_00+Chimu_30;\
+    Chi_01 = Chimu_01+Chimu_31;\
+    Chi_02 = Chimu_02+Chimu_32;\
+    Chi_10 = Chimu_10-Chimu_20;\
+    Chi_11 = Chimu_11-Chimu_21;\
+    Chi_12 = Chimu_12-Chimu_22;
+
+#define ZM_PROJ \
+  Chi_00 = Chimu_00-timesI(Chimu_20);		\
+  Chi_01 = Chimu_01-timesI(Chimu_21);		\
+  Chi_02 = Chimu_02-timesI(Chimu_22);		\
+  Chi_10 = Chimu_10+timesI(Chimu_30);		\
+  Chi_11 = Chimu_11+timesI(Chimu_31);		\
+  Chi_12 = Chimu_12+timesI(Chimu_32);
+
+#define TM_PROJ \
+  Chi_00 = Chimu_00-Chimu_20;		\
+  Chi_01 = Chimu_01-Chimu_21;		\
+  Chi_02 = Chimu_02-Chimu_22;		\
+  Chi_10 = Chimu_10-Chimu_30;		\
+  Chi_11 = Chimu_11-Chimu_31;		\
+  Chi_12 = Chimu_12-Chimu_32;
+
+//      fspin(0)=hspin(0);
+//      fspin(1)=hspin(1);
+//      fspin(2)=timesMinusI(hspin(1));
+//      fspin(3)=timesMinusI(hspin(0));
+#define XP_RECON\
+  result_00 = UChi_00;\
+  result_01 = UChi_01;\
+  result_02 = UChi_02;\
+  result_10 = UChi_10;\
+  result_11 = UChi_11;\
+  result_12 = UChi_12;\
+  result_20 = timesMinusI(UChi_10);\
+  result_21 = timesMinusI(UChi_11);\
+  result_22 = timesMinusI(UChi_12);\
+  result_30 = timesMinusI(UChi_00);\
+  result_31 = timesMinusI(UChi_01);\
+  result_32 = timesMinusI(UChi_02);
+
+#define XP_RECON_ACCUM\
+  result_00+=UChi_00;\
+  result_01+=UChi_01;\
+  result_02+=UChi_02;\
+  result_10+=UChi_10;\
+  result_11+=UChi_11;\
+  result_12+=UChi_12;\
+  result_20-=timesI(UChi_10);\
+  result_21-=timesI(UChi_11);\
+  result_22-=timesI(UChi_12);\
+  result_30-=timesI(UChi_00);\
+  result_31-=timesI(UChi_01);\
+  result_32-=timesI(UChi_02);
+
+#define XM_RECON\
+  result_00 = UChi_00;\
+  result_01 = UChi_01;\
+  result_02 = UChi_02;\
+  result_10 = UChi_10;\
+  result_11 = UChi_11;\
+  result_12 = UChi_12;\
+  result_20 = timesI(UChi_10);\
+  result_21 = timesI(UChi_11);\
+  result_22 = timesI(UChi_12);\
+  result_30 = timesI(UChi_00);\
+  result_31 = timesI(UChi_01);\
+  result_32 = timesI(UChi_02);
+
+#define XM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= timesI(UChi_10);\
+  result_21+= timesI(UChi_11);\
+  result_22+= timesI(UChi_12);\
+  result_30+= timesI(UChi_00);\
+  result_31+= timesI(UChi_01);\
+  result_32+= timesI(UChi_02);
+
+#define YP_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= UChi_10;\
+  result_21+= UChi_11;\
+  result_22+= UChi_12;\
+  result_30-= UChi_00;\
+  result_31-= UChi_01;\
+  result_32-= UChi_02;
+
+#define YM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= UChi_10;\
+  result_21-= UChi_11;\
+  result_22-= UChi_12;\
+  result_30+= UChi_00;\
+  result_31+= UChi_01;\
+  result_32+= UChi_02;
+
+#define ZP_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= timesI(UChi_00);			\
+  result_21-= timesI(UChi_01);			\
+  result_22-= timesI(UChi_02);			\
+  result_30+= timesI(UChi_10);			\
+  result_31+= timesI(UChi_11);			\
+  result_32+= timesI(UChi_12);
+
+#define ZM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= timesI(UChi_00);			\
+  result_21+= timesI(UChi_01);			\
+  result_22+= timesI(UChi_02);			\
+  result_30-= timesI(UChi_10);			\
+  result_31-= timesI(UChi_11);			\
+  result_32-= timesI(UChi_12);
+
+#define TP_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20+= UChi_00;			\
+  result_21+= UChi_01;			\
+  result_22+= UChi_02;			\
+  result_30+= UChi_10;			\
+  result_31+= UChi_11;			\
+  result_32+= UChi_12;
+
+#define TM_RECON_ACCUM\
+  result_00+= UChi_00;\
+  result_01+= UChi_01;\
+  result_02+= UChi_02;\
+  result_10+= UChi_10;\
+  result_11+= UChi_11;\
+  result_12+= UChi_12;\
+  result_20-= UChi_00;	\
+  result_21-= UChi_01;	\
+  result_22-= UChi_02;	\
+  result_30-= UChi_10;	\
+  result_31-= UChi_11;	\
+  result_32-= UChi_12;
+
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	/* one stencil leg: fetch entry, project or load, multiply by link DIR, reconstruct */ \
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {				/* local source: load the full spinor in[offset], then apply PROJ */ \
+    LOAD_CHIMU;					\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else {					/* otherwise read the half spinor from buf[offset] */ \
+    LOAD_CHI;					\
+  }						\
+  MULT_2SPIN(DIR);				\
+  RECON;					
+
+#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	/* interior variant: a leg that is neither local nor st.same_node[DIR] contributes nothing */ \
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {				/* local source: load the full spinor in[offset], then apply PROJ */ \
+    LOAD_CHIMU;					\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else if ( st.same_node[DIR] ) {		/* same-node source: read the half spinor from buf[offset] */ \
+    LOAD_CHI;					\
+  }						\
+  if (local || st.same_node[DIR] ) {		/* legs can be skipped, so RECON must be an accumulating form */ \
+    MULT_2SPIN(DIR);				\
+    RECON;					\
+  }
+
+#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	/* exterior variant: only legs that are neither local nor st.same_node[DIR]; PROJ and PERM are unused */ \
+  SE=st.GetEntry(ptype,DIR,ss);			\
+  offset = SE->_offset;				\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
+    LOAD_CHI;					\
+    MULT_2SPIN(DIR);				\
+    RECON;					\
+    nmu++;					/* counts processed legs; nmu must be in scope at the expansion site (HAND_RESULT_EXT tests it) */ \
+  }
+
+#define HAND_RESULT(ss)				\
+  {						\
+    SiteSpinor & ref (out[ss]);		\
+    vstream(ref()(0)(0),result_00);		\
+    vstream(ref()(0)(1),result_01);		\
+    vstream(ref()(0)(2),result_02);		\
+    vstream(ref()(1)(0),result_10);		\
+    vstream(ref()(1)(1),result_11);		\
+    vstream(ref()(1)(2),result_12);		\
+    vstream(ref()(2)(0),result_20);		\
+    vstream(ref()(2)(1),result_21);		\
+    vstream(ref()(2)(2),result_22);		\
+    vstream(ref()(3)(0),result_30);		\
+    vstream(ref()(3)(1),result_31);		\
+    vstream(ref()(3)(2),result_32);		\
+  }
+
+#define HAND_RESULT_EXT(ss)			\
+  if (nmu){					\
+    SiteSpinor & ref (out[ss]);		\
+    ref()(0)(0)+=result_00;		\
+    ref()(0)(1)+=result_01;		\
+    ref()(0)(2)+=result_02;		\
+    ref()(1)(0)+=result_10;		\
+    ref()(1)(1)+=result_11;		\
+    ref()(1)(2)+=result_12;		\
+    ref()(2)(0)+=result_20;		\
+    ref()(2)(1)+=result_21;		\
+    ref()(2)(2)+=result_22;		\
+    ref()(3)(0)+=result_30;		\
+    ref()(3)(1)+=result_31;		\
+    ref()(3)(2)+=result_32;		\
+  }
+
+
+#define HAND_DECLARATIONS(a)			\
+  Simd result_00;				\
+  Simd result_01;				\
+  Simd result_02;				\
+  Simd result_10;				\
+  Simd result_11;				\
+  Simd result_12;				\
+  Simd result_20;				\
+  Simd result_21;				\
+  Simd result_22;				\
+  Simd result_30;				\
+  Simd result_31;				\
+  Simd result_32;				\
+  Simd Chi_00;					\
+  Simd Chi_01;					\
+  Simd Chi_02;					\
+  Simd Chi_10;					\
+  Simd Chi_11;					\
+  Simd Chi_12;					\
+  Simd UChi_00;					\
+  Simd UChi_01;					\
+  Simd UChi_02;					\
+  Simd UChi_10;					\
+  Simd UChi_11;					\
+  Simd UChi_12;					\
+  Simd U_00;					\
+  Simd U_10;					\
+  Simd U_20;					\
+  Simd U_01;					\
+  Simd U_11;					\
+  Simd U_21;
+
+#define ZERO_RESULT				\
+  result_00=Zero();				\
+  result_01=Zero();				\
+  result_02=Zero();				\
+  result_10=Zero();				\
+  result_11=Zero();				\
+  result_12=Zero();				\
+  result_20=Zero();				\
+  result_21=Zero();				\
+  result_22=Zero();				\
+  result_30=Zero();				\
+  result_31=Zero();				\
+  result_32=Zero();			
+
+#define Chimu_00 Chi_00 // Chimu_* are not separate variables: spin rows 0,1 alias Chi_*
+#define Chimu_01 Chi_01
+#define Chimu_02 Chi_02
+#define Chimu_10 Chi_10
+#define Chimu_11 Chi_11
+#define Chimu_12 Chi_12
+#define Chimu_20 UChi_00 // spin rows 2,3 alias UChi_*, which MULT_2SPIN overwrites afterwards
+#define Chimu_21 UChi_01
+#define Chimu_22 UChi_02
+#define Chimu_30 UChi_10
+#define Chimu_31 UChi_11
+#define Chimu_32 UChi_12
+
+NAMESPACE_BEGIN(Grid);
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+// Permute index passed per direction below: T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Declares the Simd locals result_*, Chi_*, UChi_*, U_*; the macro argument is unused.
+  HAND_DECLARATIONS(ignore);
+  // offset/local/perm are assigned from the stencil entry in every leg; ptype is handed to st.GetEntry.
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  // The first leg uses the assigning XM_RECON, so result_* is never zeroed; the other seven legs accumulate.
+  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
+  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss); // vstream result_* into out[ss]
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // Declares the Simd locals result_*, Chi_*, UChi_*, U_*; the macro argument is unused.
+  HAND_DECLARATIONS(ignore);
+  // offset/local/perm are assigned from the stencil entry in every leg; ptype is handed to st.GetEntry.
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  // Same eight legs as HandDhopSite with the P and M projector/reconstruct macros exchanged; the first leg assigns result_*.
+  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
+  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT(ss); // vstream result_* into out[ss]
+}
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+// Second HAND_STENCIL_LEG_INT argument per direction: T==0, Z==1, Y==2, X==3; expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // "Int" variant (presumably interior-only legs -- confirm against HAND_STENCIL_LEG_INT, defined outside this excerpt).
+  HAND_DECLARATIONS(ignore);
+  // The scratch names below are not used directly here; presumably the leg macros reference them -- confirm.
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  ZERO_RESULT; // every leg below uses an *_ACCUM recon, so the result_* accumulators are zeroed first
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss); // presumably stores the accumulated result for site ss -- confirm against the macro
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // "Int" variant (presumably interior-only legs -- confirm against HAND_STENCIL_LEG_INT, defined outside this excerpt).
+  HAND_DECLARATIONS(ignore);
+  // Same as HandDhopSiteInt with the projectors swapped: plus on Xp..Tp, minus on Xm..Tm.
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  ZERO_RESULT; // every leg below uses an *_ACCUM recon, so the result_* accumulators are zeroed first
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT(ss); // presumably stores the accumulated result for site ss -- confirm against the macro
+}
+
+template<class Impl> void 
+WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+// Second HAND_STENCIL_LEG_EXT argument per direction: T==0, Z==1, Y==2, X==3; expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // "Ext" variant (presumably exterior/halo legs -- confirm against HAND_STENCIL_LEG_EXT, defined outside this excerpt).
+  HAND_DECLARATIONS(ignore);
+  // The scratch names below are not used directly here; presumably the leg macros reference them -- confirm.
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  int nmu=0; // not touched directly in this function; presumably updated by the _EXT macros -- confirm
+  ZERO_RESULT; // every leg below uses an *_ACCUM recon, so the result_* accumulators are zeroed first
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT_EXT(ss); // note: the _EXT result macro, unlike the HAND_RESULT used by the plain and Int kernels
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  // "Ext" variant (presumably exterior/halo legs -- confirm against HAND_STENCIL_LEG_EXT, defined outside this excerpt).
+  HAND_DECLARATIONS(ignore);
+  // Same as HandDhopSiteExt with the projectors swapped: plus on Xp..Tp, minus on Xm..Tm.
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  int nmu=0; // not touched directly in this function; presumably updated by the _EXT macros -- confirm
+  ZERO_RESULT; // every leg below uses an *_ACCUM recon, so the result_* accumulators are zeroed first
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
+  HAND_RESULT_EXT(ss); // note: the _EXT result macro, unlike the HAND_RESULT used by the plain and Int kernels
+}
+
+////////////// Wilson ; uses this implementation /////////////////////
+
+#define INSTANTIATE_THEM(A) \
+template void WilsonKernels<A>::HandDhopSite(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
+					     int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
+template void WilsonKernels<A>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+						int ss,int sU,const FermionFieldView &in, FermionFieldView &out);\
+template void WilsonKernels<A>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
+						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
+template void WilsonKernels<A>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
+template void WilsonKernels<A>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
+						int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
+template void WilsonKernels<A>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); 
+
+INSTANTIATE_THEM(WilsonImplF);
+INSTANTIATE_THEM(WilsonImplD);
+INSTANTIATE_THEM(ZWilsonImplF);
+INSTANTIATE_THEM(ZWilsonImplD);
+INSTANTIATE_THEM(DomainWallVec5dImplF);
+INSTANTIATE_THEM(DomainWallVec5dImplD);
+INSTANTIATE_THEM(ZDomainWallVec5dImplF);
+INSTANTIATE_THEM(ZDomainWallVec5dImplD);
+INSTANTIATE_THEM(WilsonImplFH);
+INSTANTIATE_THEM(WilsonImplDF);
+INSTANTIATE_THEM(ZWilsonImplFH);
+INSTANTIATE_THEM(ZWilsonImplDF);
+INSTANTIATE_THEM(DomainWallVec5dImplFH);
+INSTANTIATE_THEM(DomainWallVec5dImplDF);
+INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
+INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
+INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
+INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
+
+NAMESPACE_END(Grid);
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/Grid/qcd/action/fermion/WilsonKernelsHandGparity.cc
similarity index 98%
rename from lib/qcd/action/fermion/WilsonKernelsHand.cc
rename to Grid/qcd/action/fermion/WilsonKernelsHandGparity.cc
index 6ff78836..b3eab688 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsHandGparity.cc
@@ -931,20 +931,8 @@ template void WilsonKernels<A>::HandDhopSiteExt(StencilView &st, DoubledGaugeFie
 template void WilsonKernels<A>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
 						   int ss,int sU,const FermionFieldView &in, FermionFieldView &out); 
 
-INSTANTIATE_THEM(WilsonImplF);
-INSTANTIATE_THEM(WilsonImplD);
-INSTANTIATE_THEM(ZWilsonImplF);
-INSTANTIATE_THEM(ZWilsonImplD);
 INSTANTIATE_THEM(GparityWilsonImplF);
 INSTANTIATE_THEM(GparityWilsonImplD);
-INSTANTIATE_THEM(DomainWallVec5dImplF);
-INSTANTIATE_THEM(DomainWallVec5dImplD);
-INSTANTIATE_THEM(ZDomainWallVec5dImplF);
-INSTANTIATE_THEM(ZDomainWallVec5dImplD);
-INSTANTIATE_THEM(WilsonImplFH);
-INSTANTIATE_THEM(WilsonImplDF);
-INSTANTIATE_THEM(ZWilsonImplFH);
-INSTANTIATE_THEM(ZWilsonImplDF);
 INSTANTIATE_THEM(GparityWilsonImplFH);
 INSTANTIATE_THEM(GparityWilsonImplDF);
 INSTANTIATE_THEM(DomainWallVec5dImplFH);
diff --git a/lib/qcd/action/fermion/WilsonTMFermion.cc b/Grid/qcd/action/fermion/WilsonTMFermion.cc
similarity index 100%
rename from lib/qcd/action/fermion/WilsonTMFermion.cc
rename to Grid/qcd/action/fermion/WilsonTMFermion.cc
diff --git a/lib/qcd/action/fermion/WilsonTMFermion.h b/Grid/qcd/action/fermion/WilsonTMFermion.h
similarity index 96%
rename from lib/qcd/action/fermion/WilsonTMFermion.h
rename to Grid/qcd/action/fermion/WilsonTMFermion.h
index bb76aab1..12c4b71a 100644
--- a/lib/qcd/action/fermion/WilsonTMFermion.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion.h
@@ -25,8 +25,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef  GRID_QCD_WILSON_TM_FERMION_H
-#define  GRID_QCD_WILSON_TM_FERMION_H
+#pragma once 
 
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonFermion.h>
@@ -72,4 +71,4 @@ private:
 
 NAMESPACE_END(Grid);
 
-#endif
+
diff --git a/Grid/qcd/action/fermion/WilsonTMFermion5D.h b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
new file mode 100644
index 00000000..634909d7
--- /dev/null
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@@ -0,0 +1,152 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonTMFermion5D.h
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk> ; NB Christoph did similar in GPT
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once 
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonFermion.h>
+
+NAMESPACE_BEGIN(Grid);
+    
+template<class Impl>
+class WilsonTMFermion5D : public WilsonFermion5D<Impl>
+{
+ public:
+  INHERIT_IMPL_TYPES(Impl);
+ public:
+  // Twisted-mass Wilson operator: one (mass[s], mu[s]) pair per site s along dimension 0 of the fermion grid.
+  virtual void   Instantiatable(void) {};
+  
+  // Constructor: forwards the grids, the constant 4.0 and p to WilsonFermion5D; _mass/_mu are checked and stored by update().
+ WilsonTMFermion5D(GaugeField &_Umu,
+		   GridCartesian         &Fgrid,
+		   GridRedBlackCartesian &Frbgrid, 
+		   GridCartesian         &Ugrid,
+		   GridRedBlackCartesian &Urbgrid, 
+		   const std::vector<RealD> _mass,
+		   const std::vector<RealD> _mu,
+		   const ImplParams &p= ImplParams()
+		   ) :
+  WilsonFermion5D<Impl>(_Umu,
+			Fgrid,
+			Frbgrid,
+			Ugrid,
+			Urbgrid,
+			4.0,p)
+   
+    {
+      update(_mass,_mu);
+    }
+  // Checkerboard-changing hopping term: odd input goes through DhopEO, anything else through DhopOE.
+  virtual void Meooe(const FermionField &in, FermionField &out) {
+    if (in.Checkerboard() == Odd) {
+      this->DhopEO(in, out, DaggerNo);
+    } else {
+      this->DhopOE(in, out, DaggerNo);
+    }
+  }
+  // Same dispatch as Meooe, with DaggerYes.
+  virtual void MeooeDag(const FermionField &in, FermionField &out) {
+    if (in.Checkerboard() == Odd) {
+      this->DhopEO(in, out, DaggerYes);
+    } else {
+      this->DhopOE(in, out, DaggerYes);
+    }
+  }	
+  // Site-diagonal term, applied slice by slice with a = 4+mass[s] and b = i*mu[s].
+  // allow override for twisted mass and clover
+  virtual void Mooee(const FermionField &in, FermionField &out) {
+    out.Checkerboard() = in.Checkerboard();
+    //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
+    for (int s=0;s<(int)this->mass.size();s++) {
+      ComplexD a = 4.0+this->mass[s];
+      ComplexD b(0.0,this->mu[s]);
+      axpbg5y_ssp(out,a,in,b,in,s,s);
+    }
+  }
+  // Same as Mooee with the sign of mu[s] flipped (b = -i*mu[s]).
+  virtual void MooeeDag(const FermionField &in, FermionField &out) {
+    out.Checkerboard() = in.Checkerboard();
+    for (int s=0;s<(int)this->mass.size();s++) {
+      ComplexD a = 4.0+this->mass[s];
+      ComplexD b(0.0,-this->mu[s]);
+      axpbg5y_ssp(out,a,in,b,in,s,s);
+    }
+  }
+  virtual void MooeeInv(const FermionField &in, FermionField &out) {
+    for (int s=0;s<(int)this->mass.size();s++) {
+      // Inverse of (mtil + i tm g5) is (mtil - i tm g5)/(mtil^2 + tm^2), assuming g5*g5 = 1.
+      RealD tm   = this->mu[s];
+      RealD mtil = 4.0+this->mass[s];
+      RealD sq   = mtil*mtil+tm*tm;
+      ComplexD a    = mtil/sq;
+      ComplexD b(0.0, -tm /sq);
+      axpbg5y_ssp(out,a,in,b,in,s,s);
+    }
+  }
+  virtual void MooeeInvDag(const FermionField &in, FermionField &out) {
+    for (int s=0;s<(int)this->mass.size();s++) {
+      // Inverse of (mtil - i tm g5) is (mtil + i tm g5)/(mtil^2 + tm^2), assuming g5*g5 = 1.
+      RealD tm   = this->mu[s];
+      RealD mtil = 4.0+this->mass[s];
+      RealD sq   = mtil*mtil+tm*tm;
+      ComplexD a    = mtil/sq;
+      ComplexD b(0.0,tm /sq);
+      axpbg5y_ssp(out,a,in,b,in,s,s);
+    }
+  }
+  // Full operator: out = Dhop(in) + per-slice diagonal term (built in tmp); returns the value of axpy_norm.
+  virtual RealD M(const FermionField &in, FermionField &out) {
+    out.Checkerboard() = in.Checkerboard();
+    this->Dhop(in, out, DaggerNo);
+    FermionField tmp(out.Grid());
+    for (int s=0;s<(int)this->mass.size();s++) {
+      ComplexD a = 4.0+this->mass[s];
+      ComplexD b(0.0,this->mu[s]);
+      axpbg5y_ssp(tmp,a,in,b,in,s,s);
+    }
+    return axpy_norm(out, 1.0, tmp, out);
+  }
+  
+  // needed for fast PV; both vectors must have one entry per site along dimension 0 of the fermion grid
+  void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
+    assert(_mass.size() == _mu.size());
+    assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
+    this->mass = _mass;
+    this->mu = _mu;
+  }
+  
+ private:
+  std::vector<RealD> mu;
+  std::vector<RealD> mass;
+  
+};
+   
+typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF; 
+typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD; 
+
+NAMESPACE_END(Grid);
diff --git a/lib/qcd/action/fermion/ZMobiusFermion.h b/Grid/qcd/action/fermion/ZMobiusFermion.h
similarity index 100%
rename from lib/qcd/action/fermion/ZMobiusFermion.h
rename to Grid/qcd/action/fermion/ZMobiusFermion.h
diff --git a/lib/qcd/action/fermion/g5HermitianLinop.h b/Grid/qcd/action/fermion/g5HermitianLinop.h
similarity index 100%
rename from lib/qcd/action/fermion/g5HermitianLinop.h
rename to Grid/qcd/action/fermion/g5HermitianLinop.h
diff --git a/lib/qcd/action/gauge/Gauge.h b/Grid/qcd/action/gauge/Gauge.h
similarity index 100%
rename from lib/qcd/action/gauge/Gauge.h
rename to Grid/qcd/action/gauge/Gauge.h
diff --git a/lib/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h
similarity index 100%
rename from lib/qcd/action/gauge/GaugeImplTypes.h
rename to Grid/qcd/action/gauge/GaugeImplTypes.h
diff --git a/lib/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h
similarity index 100%
rename from lib/qcd/action/gauge/GaugeImplementations.h
rename to Grid/qcd/action/gauge/GaugeImplementations.h
diff --git a/lib/qcd/action/gauge/Photon.h b/Grid/qcd/action/gauge/Photon.h
similarity index 66%
rename from lib/qcd/action/gauge/Photon.h
rename to Grid/qcd/action/gauge/Photon.h
index 5d508136..c0d449db 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/Grid/qcd/action/gauge/Photon.h
@@ -58,9 +58,12 @@ class Photon
 public:
   INHERIT_GIMPL_TYPES(Gimpl);
   GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
-  GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
+    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2, qedInf, 3);
 public:
   Photon(Gauge gauge, ZmScheme zmScheme);
+    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements);
+    Photon(Gauge gauge, ZmScheme zmScheme, Real G0);
+    Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements, Real G0);
   virtual ~Photon(void) = default;
   void FreePropagator(const GaugeField &in, GaugeField &out);
   void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
@@ -68,19 +71,42 @@ public:
   void StochasticField(GaugeField &out, GridParallelRNG &rng);
   void StochasticField(GaugeField &out, GridParallelRNG &rng,
 		       const GaugeLinkField &weight);
+    void UnitField(GaugeField &out);
 private:
+    void infVolPropagator(GaugeLinkField &out);
   void invKHatSquared(GaugeLinkField &out);
   void zmSub(GaugeLinkField &out);
 private:
   Gauge    gauge_;
   ZmScheme zmScheme_;
+    std::vector<Real>  improvement_;
+    Real     G0_;
 };
 
 typedef Photon<QedGimplR>  PhotonR;
   
 template<class Gimpl>
 Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
-  : gauge_(gauge), zmScheme_(zmScheme)
+  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()),
+    G0_(0.15493339023106021408483720810737508876916113364521)
+  {}
+
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
+                        std::vector<Real> improvements)
+  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements),
+    G0_(0.15493339023106021408483720810737508876916113364521)
+  {}
+
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme, Real G0)
+  : gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()), G0_(G0)
+  {}
+
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
+                        std::vector<Real> improvements, Real G0)
+  : gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements), G0_(G0)
 {}
   
 template<class Gimpl>
@@ -97,6 +123,34 @@ void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
 }
   
 template<class Gimpl>
+  void Photon<Gimpl>::infVolPropagator(GaugeLinkField &out)
+  {
+    auto               *grid = dynamic_cast<GridCartesian *>(out.Grid()); // NOTE(review): result is not null-checked
+    LatticeReal        xmu(grid);
+    GaugeLinkField     one(grid);
+    const unsigned int nd    = grid->_ndimension;
+    Coordinate   &l    = grid->_fdimensions;
+    Coordinate   x0(nd,0);                          // the origin
+    TComplex           Tone  = Complex(1.0,0.0);
+    TComplex           Tzero = Complex(G0_,0.0);    // value stored at the origin after the inversion
+    FFT                fft(grid);
+    // Builds 1/(4 pi^2 x^2) with each coordinate folded into [-L/2, L/2), sets the origin to G0_, then FFTs forward.
+    one = Complex(1.0,0.0);
+    out = Zero();
+    for(int mu = 0; mu < nd; mu++)
+    {
+      LatticeCoordinate(xmu,mu);
+      Real lo2 = l[mu]/2.0;
+      xmu = where(xmu < lo2, xmu, xmu-double(l[mu])); // coordinates >= L/2 are shifted down by L
+      out = out + toComplex(4*M_PI*M_PI*xmu*xmu);
+    }
+    pokeSite(Tone, out, x0);   // out is 0 at the origin; put 1 there so the division below is finite
+    out = one/out;
+    pokeSite(Tzero, out, x0);  // overwrite the origin with G0_
+    fft.FFT_all_dim(out, out, FFT::forward);
+  }
+  
+  template<class Gimpl>
 void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
 {
   GridBase           *grid = out.Grid();
@@ -127,6 +181,7 @@ void Photon<Gimpl>::zmSub(GaugeLinkField &out)
 {
   GridBase           *grid = out.Grid();
   const unsigned int nd    = grid->_ndimension;
+    Coordinate   &l    = grid->_fdimensions;
     
   switch (zmScheme_)
     {
@@ -148,11 +203,17 @@ void Photon<Gimpl>::zmSub(GaugeLinkField &out)
         for(int d = 0; d < grid->_ndimension - 1; d++)
 	  {
 	    LatticeCoordinate(coor,d);
+          coor = where(coor < Integer(l[d]/2), coor, coor-Integer(l[d]));
 	    spNrm = spNrm + coor*coor;
 	  }
         out = where(spNrm == Integer(0), 0.*out, out);
         
-        break;
+        // IR improvement
+        for(int i = 0; i < improvement_.size(); i++)
+        {
+          Real f = sqrt(improvement_[i]+1);
+          out = where(spNrm == Integer(i+1), f*out, out);
+        }
       }
     default:
       break;
@@ -164,12 +225,27 @@ void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
 					    GaugeField &out)
 {
   GridBase           *grid = out.Grid();
-  LatticeComplex     k2Inv(grid);
+  LatticeComplex     momProp(grid);
+
+    switch (zmScheme_)
+    {
+      case ZmScheme::qedTL:
+      case ZmScheme::qedL:
+      {
+        invKHatSquared(momProp);
+        zmSub(momProp);
+        break;
+      }
+      case ZmScheme::qedInf:
+      {
+        infVolPropagator(momProp);
+        break;
+      }
+      default:
+        break;
+    }
     
-  invKHatSquared(k2Inv);
-  zmSub(k2Inv);
-    
-  out = in*k2Inv;
+    out = in*momProp;
 }
   
 template<class Gimpl>
@@ -179,14 +255,30 @@ void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
   const unsigned int nd        = grid->_ndimension;
   Coordinate   latt_size = grid->_fdimensions;
     
+    switch (zmScheme_)
+    {
+      case ZmScheme::qedTL:
+      case ZmScheme::qedL:
+      {
   Integer vol = 1;
   for(int d = 0; d < nd; d++)
     {
       vol = vol * latt_size[d];
     }
   invKHatSquared(weight);
-  weight = sqrt(vol*real(weight));
+        weight = sqrt(vol)*sqrt(weight);
   zmSub(weight);
+        break;
+      }
+      case ZmScheme::qedInf:
+      {
+        infVolPropagator(weight);
+        weight = sqrt(real(weight));
+        break;
+      }
+      default:
+        break;
+    }
 }
   
 template<class Gimpl>
@@ -209,16 +301,55 @@ void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
   GaugeField         aTilde(grid);
   FFT                fft(grid);
     
+    switch (zmScheme_)
+    {
+      case ZmScheme::qedTL:
+      case ZmScheme::qedL:
+      {
   for(int mu = 0; mu < nd; mu++)
     {
       gaussian(rng, r);
       r = weight*r;
       pokeLorentz(aTilde, r, mu);
     }
+        break;
+      }
+      case ZmScheme::qedInf:
+      {
+        Complex                    shift(1., 1.); // This needs to be a GaugeLink element?
+        for(int mu = 0; mu < nd; mu++)
+        {
+          bernoulli(rng, r);
+          r = weight*(2.*r - shift);
+          pokeLorentz(aTilde, r, mu);
+        }
+        break;
+      }
+      default:
+        break;
+    }
+
   fft.FFT_all_dim(out, aTilde, FFT::backward);
     
   out = real(out);
 }
+
+  template<class Gimpl>
+  void Photon<Gimpl>::UnitField(GaugeField &out)
+  {
+    auto               *grid = dynamic_cast<GridCartesian *>(out.Grid()); // NOTE(review): result is not null-checked
+    const unsigned int nd = grid->_ndimension;
+    GaugeLinkField     r(grid);
+    // Fill every Lorentz component of out with the constant 1 + 0i.
+    r = Complex(1.0,0.0);
+
+    for(int mu = 0; mu < nd; mu++)
+    {
+      pokeLorentz(out, r, mu);
+    }
+    // Same final step as StochasticField; the imaginary part poked above is already 0.
+    out = real(out);
+  }
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
 //                                                            const GaugeField &in)
diff --git a/lib/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
similarity index 100%
rename from lib/qcd/action/gauge/PlaqPlusRectangleAction.h
rename to Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
diff --git a/lib/qcd/action/gauge/WilsonGaugeAction.h b/Grid/qcd/action/gauge/WilsonGaugeAction.h
similarity index 88%
rename from lib/qcd/action/gauge/WilsonGaugeAction.h
rename to Grid/qcd/action/gauge/WilsonGaugeAction.h
index e2b4e5fc..40d600d2 100644
--- a/lib/qcd/action/gauge/WilsonGaugeAction.h
+++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h
@@ -70,27 +70,22 @@ public:
 
     RealD factor = 0.5 * beta / RealD(Nc);
 
-    //GaugeLinkField Umu(U.Grid());
+    GaugeLinkField Umu(U.Grid());
     GaugeLinkField dSdU_mu(U.Grid());
     for (int mu = 0; mu < Nd; mu++) {
 
-      GaugeLinkField Umu(U.Grid());
-      GaugeLinkField dSdU_mu(U.Grid());
-      for (int mu = 0; mu < Nd; mu++) {
-	Umu = PeekIndex<LorentzIndex>(U, mu);
- 
-	// Staple in direction mu
-	WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
-	dSdU_mu = Ta(Umu * dSdU_mu) * factor;
- 
-	PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
-
-      }
+      Umu = PeekIndex<LorentzIndex>(U, mu);
+      
+      // Staple in direction mu
+      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
+      dSdU_mu = Ta(Umu * dSdU_mu) * factor;
+      
+      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
     }
   }
 private:
   RealD beta;  
-};
+ };
 
 NAMESPACE_END(Grid);
 #endif
diff --git a/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h b/Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
rename to Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
diff --git a/lib/qcd/action/pseudofermion/ExactOneFlavourRatio.h b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/ExactOneFlavourRatio.h
rename to Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
diff --git a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
rename to Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
diff --git a/lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
rename to Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
diff --git a/lib/qcd/action/pseudofermion/OneFlavourRational.h b/Grid/qcd/action/pseudofermion/OneFlavourRational.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/OneFlavourRational.h
rename to Grid/qcd/action/pseudofermion/OneFlavourRational.h
diff --git a/lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/OneFlavourRationalRatio.h
rename to Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
diff --git a/lib/qcd/action/pseudofermion/PseudoFermion.h b/Grid/qcd/action/pseudofermion/PseudoFermion.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/PseudoFermion.h
rename to Grid/qcd/action/pseudofermion/PseudoFermion.h
diff --git a/lib/qcd/action/pseudofermion/TwoFlavour.h b/Grid/qcd/action/pseudofermion/TwoFlavour.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/TwoFlavour.h
rename to Grid/qcd/action/pseudofermion/TwoFlavour.h
diff --git a/lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
rename to Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
diff --git a/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
rename to Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
diff --git a/lib/qcd/action/pseudofermion/TwoFlavourRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h
similarity index 100%
rename from lib/qcd/action/pseudofermion/TwoFlavourRatio.h
rename to Grid/qcd/action/pseudofermion/TwoFlavourRatio.h
diff --git a/lib/qcd/action/scalar/Scalar.h b/Grid/qcd/action/scalar/Scalar.h
similarity index 95%
rename from lib/qcd/action/scalar/Scalar.h
rename to Grid/qcd/action/scalar/Scalar.h
index ee52ea13..44f7c450 100644
--- a/lib/qcd/action/scalar/Scalar.h
+++ b/Grid/qcd/action/scalar/Scalar.h
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-#ifndef GRID_QCD_SCALAR_H
-#define GRID_QCD_SCALAR_H
+
+#pragma once
 
 #include <Grid/qcd/action/scalar/ScalarImpl.h>
 #include <Grid/qcd/action/scalar/ScalarAction.h>
@@ -45,4 +45,4 @@ template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractio
   
 NAMESPACE_END(Grid);
 
-#endif  // GRID_QCD_SCALAR_H
+
diff --git a/lib/qcd/action/scalar/ScalarAction.h b/Grid/qcd/action/scalar/ScalarAction.h
similarity index 100%
rename from lib/qcd/action/scalar/ScalarAction.h
rename to Grid/qcd/action/scalar/ScalarAction.h
diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/Grid/qcd/action/scalar/ScalarImpl.h
similarity index 57%
rename from lib/qcd/action/scalar/ScalarImpl.h
rename to Grid/qcd/action/scalar/ScalarImpl.h
index 3a175346..febb315e 100644
--- a/lib/qcd/action/scalar/ScalarImpl.h
+++ b/Grid/qcd/action/scalar/ScalarImpl.h
@@ -1,5 +1,4 @@
-#ifndef SCALAR_IMPL
-#define SCALAR_IMPL
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -87,6 +86,12 @@ public:
     
 };
 
+  #ifdef  USE_FFT_ACCELERATION
+  #ifndef FFT_MASS
+  #error  "USE_FFT_ACCELERATION is defined but not FFT_MASS"
+  #endif
+  #endif
+  
 template <class S, unsigned int N>
 class ScalarAdjMatrixImplTypes {
 public:
@@ -107,18 +112,113 @@ public:
   typedef Field                FermionField;
   typedef Field                PropagatorField;
 
-  static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
+    static void MomentaSquare(ComplexField &out)
+    {
+      GridBase *grid = out.Grid();
+      const Coordinate &l = grid->FullDimensions();
+      ComplexField kmu(grid);
+      // Adds sum_mu (2 sin(pi k_mu / L_mu))^2 to out. out is NOT cleared here: the caller's initial value is kept.
+      for (int mu = 0; mu < grid->Nd(); mu++)
+      {
+        Real twoPiL = M_PI * 2.0 / l[mu];
+        LatticeCoordinate(kmu, mu);            // kmu = integer coordinate along mu
+        kmu = 2.0 * sin(0.5 * twoPiL * kmu);
+        out += kmu * kmu;
+      }
+    }
+    // out = 1 / (m^2 + MomentaSquare): out is seeded with m*m, MomentaSquare accumulates onto it, then it is inverted.
+    static void MomentumSpacePropagator(ComplexField &out, RealD m)
+    {
+      GridBase *grid = out.Grid();
+      ComplexField one(grid);
+      one = Complex(1.0, 0.0);
+      out = m * m;
+      MomentaSquare(out);
+      out = one / out;
+    }
+
+    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
+    {
+#ifndef USE_FFT_ACCELERATION
     Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
+#else
+      // Fourier-accelerated draw: Gaussian field -> forward FFT -> times sqrt(p^2 + M^2), M = FFT_MASS -> backward FFT into P.
+      Field Pgaussian(P.Grid()), Pp(P.Grid());
+      ComplexField p2(P.Grid()); p2 = Zero(); // MomentaSquare() accumulates with +=, so start from zero
+      RealD M = FFT_MASS;
+      
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
+
+      FFT theFFT((GridCartesian*)P.Grid());
+      theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
+      MomentaSquare(p2);                      // p2 = sum_mu (2 sin(pi k_mu / L_mu))^2
+      p2 += M * M;
+      p2 = sqrt(p2);
+      Pp *= p2;
+      theFFT.FFT_all_dim(P, Pp, FFT::backward);
+
+#endif //USE_FFT_ACCELERATION
   }
 
   static inline Field projectForce(Field& P) {return P;}
 
-  static inline void update_field(Field& P, Field& U, double ep) {
+    static inline void update_field(Field &P, Field &U, double ep)
+    {
+#ifndef USE_FFT_ACCELERATION
+      double t0=usecond(); 
     U += P*ep;
-  }
+      double t1=usecond();
+      double total_time = (t1-t0)/1e6;
+      std::cout << GridLogIntegrator << "Total time for updating field (s)       : " << total_time << std::endl; 
+#else
+      // FFT transform P(x) -> P(p)
+      // divide by (M^2+p^2)  M external parameter (how to pass?)
+      // P'(p) = P(p)/(M^2+p^2)
+      // Transform back -> P'(x)
+      // U += P'(x)*ep
 
-  static inline RealD FieldSquareNorm(Field& U) {
+      Field Pp(U.Grid()), P_FFT(U.Grid());     
+      static ComplexField p2(U.Grid());
+      RealD M = FFT_MASS;
+      
+      FFT theFFT((GridCartesian*)U.Grid());
+      theFFT.FFT_all_dim(Pp, P, FFT::forward);
+
+      static bool first_call = true;
+      if (first_call)
+      {
+        // avoid recomputing
+        MomentumSpacePropagator(p2, M);
+        first_call = false;
+  }
+      Pp *= p2;
+      theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
+      U += P_FFT * ep;
+
+#endif //USE_FFT_ACCELERATION
+    }
+
+    static inline RealD FieldSquareNorm(Field &U)
+    {
+#ifndef USE_FFT_ACCELERATION
     return (TensorRemove(sum(trace(U*U))).real());
+#else
+      // In case of Fourier acceleration we have to:
+      // compute U(p)*U(p)/(M^2+p^2))   Parseval theorem
+      // 1 FFT needed U(x) -> U(p)
+      // M to be passed
+
+      FFT theFFT((GridCartesian*)U.Grid());
+      Field Up(U.Grid());
+
+      theFFT.FFT_all_dim(Up, U, FFT::forward);
+      RealD M = FFT_MASS;
+      ComplexField p2(U.Grid());
+      MomentumSpacePropagator(p2, M);
+      Field Up2 = Up * p2;
+      // from the definition of the DFT we need to divide by the volume
+      return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U.Grid()->gSites());
+#endif //USE_FFT_ACCELERATION
   }
 
   static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
@@ -135,9 +235,6 @@ public:
 
 };
 
-
-
-
 typedef ScalarImplTypes<vReal> ScalarImplR;
 typedef ScalarImplTypes<vRealF> ScalarImplF;
 typedef ScalarImplTypes<vRealD> ScalarImplD;
@@ -156,4 +253,3 @@ template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComp
 
 NAMESPACE_END(Grid);
 
-#endif
diff --git a/Grid/qcd/action/scalar/ScalarInteractionAction.h b/Grid/qcd/action/scalar/ScalarInteractionAction.h
new file mode 100644
index 00000000..fbd84398
--- /dev/null
+++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h
@@ -0,0 +1,205 @@
+/*************************************************************************************
+
+  Grid physics library, www.github.com/paboyle/Grid
+
+  Source file: ./Grid/qcd/action/scalar/ScalarInteractionAction.h
+
+  Copyright (C) 2015
+
+  Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with this program; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+  See the full license in the file "LICENSE" in the top level distribution
+directory
+  *************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+// Note: this action can completely absorb the ScalarAction for real float fields
+// use the scalarObjs to generalise the structure
+
+NAMESPACE_BEGIN(Grid);
+
+template <class Impl, int Ndim>
+class ScalarInteractionAction : public Action<typename Impl::Field>
+{
+  // Scalar-field action built from on-site quadratic/quartic terms and nearest-neighbour products (see S()).
+public:
+  INHERIT_FIELD_TYPES(Impl);
+private:
+  RealD mass_square; // enters the quadratic term as (2 * Ndim + mass_square)
+  RealD lambda;      // coefficient of the quartic term
+  RealD g;           // both S() and deriv() scale their result by N / g
+  const unsigned int N = Impl::Group::Dimension;
+
+  typedef typename Field::vector_object vobj;
+  typedef CartesianStencil<vobj, vobj> Stencil;
+
+  SimpleCompressor<vobj> compressor;
+  int npoint = 2 * Ndim;          // one forward and one backward stencil point per direction
+  std::vector<int> directions;    // direction of each stencil point
+  std::vector<int> displacements; // +1 for points [0, Ndim), -1 for points [Ndim, 2 * Ndim)
+public:
+  // ms, l, gval set mass_square, lambda, g; the initialiser list follows the member declaration order.
+  ScalarInteractionAction(RealD ms, RealD l, RealD gval) : mass_square(ms), lambda(l), g(gval), directions(2 * Ndim, 0), displacements(2 * Ndim, 0)
+  {
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      directions[mu] = mu;
+      directions[mu + Ndim] = mu;
+      displacements[mu] = 1;
+      displacements[mu + Ndim] = -1;
+    }
+  }
+
+  virtual std::string LogParameters() // returns one log line per coupling
+  {
+    std::stringstream sstream;
+    sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda << std::endl;
+    sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
+    sstream << GridLogMessage << "[ScalarAction] g           : " << g << std::endl;
+    return sstream.str();
+  }
+
+  virtual std::string action_name() { return "ScalarAction"; }
+
+  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // intentionally empty
+  // Returns -Re(sum over sites of tr(action)) * N / g, where action is the per-site density assembled below.
+  virtual RealD S(const Field &p)
+  {
+    assert(p._grid->Nd() == Ndim);
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements); // NOTE(review): function-local static, built from the first p only - confirm the grid never changes
+    phiStencil.HaloExchange(p, compressor);
+    Field action(p._grid), pshift(p._grid), phisquared(p._grid);
+    phisquared = p * p;
+    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      // stencil replacement for pshift = Cshift(p, mu, +1), which is not efficient here
+      parallel_for(int i = 0; i < p._grid->oSites(); i++)
+      {
+        int permute_type;
+        StencilEntry *SE;
+        vobj temp2;
+        const vobj *temp, *t_p;
+
+        SE = phiStencil.GetEntry(permute_type, mu, i);
+        t_p = &p._odata[i];
+        if (SE->_is_local)
+        {
+          temp = &p._odata[SE->_offset];
+          if (SE->_permute)
+          {
+            permute(temp2, *temp, permute_type);
+            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
+          }
+          else
+          {
+            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
+          }
+        }
+        else
+        {
+          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
+        }
+      }
+      // net effect for this direction: action -= pshift*p + p*pshift
+    }
+    // NB the trace in the algebra is normalised to 1/2
+    // minus sign coming from the antihermitian fields
+    return -(TensorRemove(sum(trace(action)))).real() * N / g;
+  }
+  // Fills force with ((2 * Ndim + mass_square) p - 2 lambda p^3 - sum of the 2 * Ndim stencil neighbours) * N / g; logs timings.
+  virtual void deriv(const Field &p, Field &force)
+  {
+    double t0 = usecond();
+    assert(p._grid->Nd() == Ndim);
+    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
+    double interm_t = usecond();
+
+    // TODO: move this outside (function-local static, constructed on the first call only)
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+
+    phiStencil.HaloExchange(p, compressor);
+    double halo_t = usecond();
+    int chunk = 128;
+    // reference form: for (int mu = 0; mu < Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+
+    // inverting the order of the loops slows down the code(! g++ 7)
+    // cannot try to reduce the number of  force writes by factor npoint...
+    // use cache blocking
+    for (int point = 0; point < npoint; point++)
+    {
+      // one parallel region per stencil point; the scratch variables declared inside it are per-thread
+#pragma omp parallel
+      {
+        int permute_type;
+        StencilEntry *SE;
+        const vobj *temp;
+
+#pragma omp for schedule(static, chunk)
+        for (int i = 0; i < p._grid->oSites(); i++)
+        {
+          SE = phiStencil.GetEntry(permute_type, point, i);
+          // prefetch next p?
+
+          if (SE->_is_local)
+          {
+            temp = &p._odata[SE->_offset];
+
+            if (SE->_permute)
+            {
+              vobj temp2;
+              permute(temp2, *temp, permute_type);
+              force._odata[i] -= temp2;
+            }
+            else
+            {
+              force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
+            }
+          }
+          else
+          {
+            force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+          }
+        }
+
+      }
+    }
+    force *= N / g;
+
+    double t1 = usecond();
+    double total_time = (t1 - t0) / 1e6;
+    double interm_time = (interm_t - t0) / 1e6;
+    double halo_time = (halo_t - interm_t) / 1e6;
+    double stencil_time = (t1 - halo_t) / 1e6;
+    std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
+    std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
+    std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
+    std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
+    double flops = double(p._grid->gSites()) * (14.0 * N * N * N + 18.0 * N * N + 2.0); // evaluated in double: the integer product can wrap on large lattices
+    double flops_no_stencil = double(p._grid->gSites()) * (14.0 * N * N * N + 6.0 * N * N + 2.0); // same reason as above
+    double Gflops = flops / (total_time * 1e9);
+    double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
+    std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
+    std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
+  }
+};
+
+NAMESPACE_END(Grid);
+
+
diff --git a/lib/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h
similarity index 99%
rename from lib/qcd/hmc/GenericHMCrunner.h
rename to Grid/qcd/hmc/GenericHMCrunner.h
index b7937a49..c2443dd0 100644
--- a/lib/qcd/hmc/GenericHMCrunner.h
+++ b/Grid/qcd/hmc/GenericHMCrunner.h
@@ -208,7 +208,7 @@ typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
 ScalarAdjGenericHMCRunner;
 
 template <int Colours> 
-using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
+using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >;
 
 NAMESPACE_END(Grid);
 
diff --git a/lib/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h
similarity index 99%
rename from lib/qcd/hmc/HMC.h
rename to Grid/qcd/hmc/HMC.h
index 839e2dea..0f933204 100644
--- a/lib/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -36,8 +36,7 @@ directory
 			    * @author Guido Cossu
 			    */
 			   //--------------------------------------------------------------------
-#ifndef HMC_INCLUDED
-#define HMC_INCLUDED
+#pragma once
 
 #include <string>
 #include <list>
@@ -236,10 +235,8 @@ public:
 };
 
 NAMESPACE_END(Grid);
-
 // april 11 2017 merge, Guido, commenting out
 //#include <Grid/parallelIO/NerscIO.h>
 //#include <Grid/qcd/hmc/NerscCheckpointer.h>
 //#include <Grid/qcd/hmc/HmcRunner.h>
 
-#endif
diff --git a/lib/qcd/hmc/HMCModules.h b/Grid/qcd/hmc/HMCModules.h
similarity index 100%
rename from lib/qcd/hmc/HMCModules.h
rename to Grid/qcd/hmc/HMCModules.h
diff --git a/lib/qcd/hmc/HMCResourceManager.h b/Grid/qcd/hmc/HMCResourceManager.h
similarity index 80%
rename from lib/qcd/hmc/HMCResourceManager.h
rename to Grid/qcd/hmc/HMCResourceManager.h
index f4f5f8a1..783e4890 100644
--- a/lib/qcd/hmc/HMCResourceManager.h
+++ b/Grid/qcd/hmc/HMCResourceManager.h
@@ -32,21 +32,37 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 
 #include <unordered_map>
 
-			   // One function per Checkpointer, use a macro to simplify
-#define RegisterLoadCheckPointerFunction(NAME)				\
-   void Load##NAME##Checkpointer(const CheckpointerParameters& Params_) { \
-     if (!have_CheckPointer) {						\
-       std::cout << GridLogDebug << "Loading Checkpointer " << #NAME	\
-		 << std::endl;						\
-       CP = std::unique_ptr<CheckpointerBaseModule>(			\
-						    new NAME##CPModule<ImplementationPolicy>(Params_));	\
-       have_CheckPointer = true;					\
-     } else {								\
-       std::cout << GridLogError << "Checkpointer already loaded "	\
-		 << std::endl;						\
-       exit(1);								\
-     }									\
-   }
+// Generates one Load<NAME>Checkpointer(Params_) member per checkpointer type; loading a second checkpointer logs an error and calls exit(1).
+#define RegisterLoadCheckPointerFunction(NAME)                           \
+  void Load##NAME##Checkpointer(const CheckpointerParameters& Params_) { \
+    if (!have_CheckPointer) {                                            \
+      std::cout << GridLogDebug << "Loading Checkpointer " << #NAME      \
+                << std::endl;                                            \
+      CP = std::unique_ptr<CheckpointerBaseModule>(                      \
+        new NAME##CPModule<ImplementationPolicy>(Params_));              \
+      have_CheckPointer = true;                                          \
+    } else {                                                             \
+      std::cout << GridLogError << "Checkpointer already loaded "        \
+                << std::endl;                                            \
+      exit(1);                                                           \
+    }                                                                    \
+  }
+// Same pattern for checkpointers that also take a Metadata object (M_), which is forwarded to NAME##CPModule.
+#define RegisterLoadCheckPointerMetadataFunction(NAME)                   \
+  template < class Metadata >                                            \
+  void Load##NAME##Checkpointer(const CheckpointerParameters& Params_, const Metadata& M_) { \
+    if (!have_CheckPointer) {                                            \
+      std::cout << GridLogDebug << "Loading Metadata Checkpointer " << #NAME      \
+                << std::endl;                                            \
+      CP = std::unique_ptr<CheckpointerBaseModule>(                      \
+        new NAME##CPModule<ImplementationPolicy, Metadata >(Params_, M_));   \
+      have_CheckPointer = true;                                          \
+    } else {                                                             \
+      std::cout << GridLogError << "Checkpointer already loaded "        \
+                << std::endl;                                            \
+      exit(1);                                                           \
+    }                                                                    \
+  }
 
 NAMESPACE_BEGIN(Grid);
 
@@ -76,7 +92,7 @@ class HMCResourceManager {
   bool have_CheckPointer;
 
   // NOTE: operator << is not overloaded for std::vector<string> 
-  // so thsi function is necessary
+  // so this function is necessary
   void output_vector_string(const std::vector<std::string> &vs){
     for (auto &i: vs)
       std::cout << i << " ";
@@ -253,6 +269,7 @@ public:
   RegisterLoadCheckPointerFunction(Nersc);
 #ifdef HAVE_LIME
   RegisterLoadCheckPointerFunction(ILDG);
+  RegisterLoadCheckPointerMetadataFunction(Scidac);
 #endif
 
   ////////////////////////////////////////////////////////
diff --git a/lib/qcd/hmc/HMCRunnerModule.h b/Grid/qcd/hmc/HMCRunnerModule.h
similarity index 100%
rename from lib/qcd/hmc/HMCRunnerModule.h
rename to Grid/qcd/hmc/HMCRunnerModule.h
diff --git a/lib/qcd/hmc/HMC_GridModules.h b/Grid/qcd/hmc/HMC_GridModules.h
similarity index 100%
rename from lib/qcd/hmc/HMC_GridModules.h
rename to Grid/qcd/hmc/HMC_GridModules.h
diff --git a/lib/qcd/hmc/HMC_aggregate.h b/Grid/qcd/hmc/HMC_aggregate.h
similarity index 93%
rename from lib/qcd/hmc/HMC_aggregate.h
rename to Grid/qcd/hmc/HMC_aggregate.h
index 53d4b9c8..e4d2ce83 100644
--- a/lib/qcd/hmc/HMC_aggregate.h
+++ b/Grid/qcd/hmc/HMC_aggregate.h
@@ -28,25 +28,26 @@ directory
 /*  END LEGAL */
 //--------------------------------------------------------------------
 //--------------------------------------------------------------------
-#ifndef HMC_AGGREGATE_INCLUDED
-#define HMC_AGGREGATE_INCLUDED
+#pragma once
 
 #include <string>
 
 #include <Grid/qcd/observables/hmc_observable.h>
 #include <Grid/qcd/hmc/HMC.h>
 
-
 // annoying location; should move this ?
 #include <Grid/parallelIO/IldgIOtypes.h>
 #include <Grid/parallelIO/IldgIO.h>
 #include <Grid/parallelIO/NerscIO.h>
+NAMESPACE_CHECK(Ildg);
 
 #include <Grid/qcd/hmc/checkpointers/CheckPointers.h>
 #include <Grid/qcd/hmc/HMCModules.h>
 #include <Grid/qcd/modules/mods.h>
+NAMESPACE_CHECK(HMCmodules);
 #include <Grid/qcd/hmc/HMCResourceManager.h>
+NAMESPACE_CHECK(HMCresourcemanager);
 #include <Grid/qcd/hmc/GenericHMCrunner.h>
 #include <Grid/qcd/hmc/HMCRunnerModule.h>
+NAMESPACE_CHECK(HMCrunner);
 
-#endif
diff --git a/lib/qcd/hmc/UsingHMC.md b/Grid/qcd/hmc/UsingHMC.md
similarity index 100%
rename from lib/qcd/hmc/UsingHMC.md
rename to Grid/qcd/hmc/UsingHMC.md
diff --git a/lib/qcd/hmc/checkpointers/BaseCheckpointer.h b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
similarity index 92%
rename from lib/qcd/hmc/checkpointers/BaseCheckpointer.h
rename to Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
index 391d2d6f..3cd05ebc 100644
--- a/lib/qcd/hmc/checkpointers/BaseCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h
@@ -75,6 +75,14 @@ public:
     }
   } 
 
+  void check_filename(const std::string &filename){
+    // Probe the path by opening it for reading; abort the run when the stream is not good.
+    std::ifstream probe(filename.c_str());
+    if (probe.good()) return;
+    std::cout << GridLogError << "Filename " << filename << " not found. Aborting. " << std::endl;
+    abort();
+  }
+
   virtual void initialize(const CheckpointerParameters &Params) = 0;
 
   virtual void CheckpointRestore(int traj, typename Impl::Field &U,
diff --git a/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h b/Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h
similarity index 98%
rename from lib/qcd/hmc/checkpointers/BinaryCheckpointer.h
rename to Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h
index 4f84e94a..ef9e6194 100644
--- a/lib/qcd/hmc/checkpointers/BinaryCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h
@@ -92,6 +92,9 @@ public:
   void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
     std::string config, rng;
     this->build_filenames(traj, Params, config, rng);
+    this->check_filename(rng);
+    this->check_filename(config);
+
 
     BinarySimpleMunger<sobj_double, sobj> munge;
 
diff --git a/lib/qcd/hmc/checkpointers/CheckPointerModules.h b/Grid/qcd/hmc/checkpointers/CheckPointerModules.h
similarity index 88%
rename from lib/qcd/hmc/checkpointers/CheckPointerModules.h
rename to Grid/qcd/hmc/checkpointers/CheckPointerModules.h
index d920be42..13b6cf13 100644
--- a/lib/qcd/hmc/checkpointers/CheckPointerModules.h
+++ b/Grid/qcd/hmc/checkpointers/CheckPointerModules.h
@@ -127,6 +127,20 @@ class ILDGCPModule: public CheckPointerModule< ImplementationPolicy> {
 
 };
 
+template<class ImplementationPolicy, class Metadata>
+class ScidacCPModule: public CheckPointerModule< ImplementationPolicy> {
+  // Checkpointer module that creates a ScidacHmcCheckpointer carrying a user-supplied Metadata object.
+  typedef CheckPointerModule< ImplementationPolicy> CPBase;
+  Metadata M; // forwarded to the checkpointer in initialize()
+  // acquire resource: allocate the checkpointer from the stored parameters (this->Par_) and M
+  virtual void initialize(){
+     this->CheckPointPtr.reset(new ScidacHmcCheckpointer<ImplementationPolicy, Metadata>(this->Par_, M));
+  }
+public:
+  ScidacCPModule(typename CPBase::APar Par, Metadata M_): CPBase(Par), M(M_) {} // base listed first: that is the actual initialisation order
+  template <class ReaderClass>
+  ScidacCPModule(Reader<ReaderClass>& Reader) : Parametrized<typename CPBase::APar>(Reader), M(Reader){};
+};
 #endif
 
 extern char cp_string[];
diff --git a/lib/qcd/hmc/checkpointers/CheckPointers.h b/Grid/qcd/hmc/checkpointers/CheckPointers.h
similarity index 96%
rename from lib/qcd/hmc/checkpointers/CheckPointers.h
rename to Grid/qcd/hmc/checkpointers/CheckPointers.h
index 423ce45c..e7a5fa82 100644
--- a/lib/qcd/hmc/checkpointers/CheckPointers.h
+++ b/Grid/qcd/hmc/checkpointers/CheckPointers.h
@@ -34,6 +34,7 @@ directory
 #include <Grid/qcd/hmc/checkpointers/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h>
 #include <Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h>
+#include <Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h>
 //#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h>
 
 
diff --git a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
similarity index 96%
rename from lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
rename to Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
index 9478be64..258fcea5 100644
--- a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -73,10 +73,10 @@ public:
     if ((traj % Params.saveInterval) == 0) {
       std::string config, rng;
       this->build_filenames(traj, Params, config, rng);
-      
+      GridBase *grid = U._grid;
       uint32_t nersc_csum,scidac_csuma,scidac_csumb;
       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-      IldgWriter _IldgWriter;
+      IldgWriter _IldgWriter(grid->IsBoss());
       _IldgWriter.open(config);
       _IldgWriter.writeConfiguration(U, traj, config, config);
       _IldgWriter.close();
@@ -94,6 +94,10 @@ public:
                          GridParallelRNG &pRNG) {
     std::string config, rng;
     this->build_filenames(traj, Params, config, rng);
+    this->check_filename(rng);
+    this->check_filename(config);
+
+    
 
     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
     BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
diff --git a/lib/qcd/hmc/checkpointers/NerscCheckpointer.h b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
similarity index 97%
rename from lib/qcd/hmc/checkpointers/NerscCheckpointer.h
rename to Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
index 6004e5e8..cfcc44d8 100644
--- a/lib/qcd/hmc/checkpointers/NerscCheckpointer.h
+++ b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h
@@ -68,6 +68,9 @@ public:
                          GridParallelRNG &pRNG) {
     std::string config, rng;
     this->build_filenames(traj, Params, config, rng);
+    this->check_filename(rng);
+    this->check_filename(config);
+
 
     FieldMetaData header;
     NerscIO::readRNGState(sRNG, pRNG, header, rng);
diff --git a/Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h b/Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h
new file mode 100644
index 00000000..18daf1eb
--- /dev/null
+++ b/Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h
@@ -0,0 +1,119 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h
+
+Copyright (C) 2018
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#ifdef HAVE_LIME
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+NAMESPACE_BEGIN(Grid);
+// For generic fields
+template <class Implementation, class Metadata>
+class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
+  // HMC checkpointer: the field goes through ScidacWriter/ScidacReader, the RNG state through BinaryIO.
+  typedef typename Implementation::Field Field;
+
+  CheckpointerParameters Params;  // copied in initialize()
+  Metadata MData;                 // handed to writeScidacFieldRecord on every save
+
+ public:
+  //INHERIT_GIMPL_TYPES(Implementation);
+
+  ScidacHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
+  ScidacHmcCheckpointer(const CheckpointerParameters &Params_, const Metadata& M_):MData(M_) { initialize(Params_); }
+
+  // Stores the parameters and exits the program unless the format tag is one of the four allowed values.
+  void initialize(const CheckpointerParameters &Params_) {
+    Params = Params_;
+
+    // check here that the format is valid
+    const bool is32 = (Params.format == std::string("IEEE32BIG")) ||
+                      (Params.format == std::string("IEEE32"));
+    const bool is64 = (Params.format == std::string("IEEE64BIG")) ||
+                      (Params.format == std::string("IEEE64"));
+    if (is32 || is64) return;
+
+    std::cout << GridLogError << "Unrecognized file format " << Params.format
+              << std::endl;
+    std::cout << GridLogError
+              << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64"
+              << std::endl;
+
+    exit(1);
+  }
+
+  // On every saveInterval-th trajectory: write the RNG state, then the field record together with MData.
+  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
+                          GridParallelRNG &pRNG) {
+    if ((traj % Params.saveInterval) != 0) return;
+
+    std::string cfg_file, rng_file;
+    this->build_filenames(traj, Params, cfg_file, rng_file);
+    GridBase *grid = U._grid;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    BinaryIO::writeRNG(sRNG, pRNG, rng_file, 0,nersc_csum,scidac_csuma,scidac_csumb);
+
+    ScidacWriter writer(grid->IsBoss());
+    writer.open(cfg_file);
+    writer.writeScidacFieldRecord(U, MData);
+    writer.close();
+    std::cout << GridLogMessage << "Written Scidac Configuration on " << cfg_file << std::endl;
+  }
+  // Restores the RNG state and the field for trajectory traj; both files must exist (check_filename).
+  void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
+                         GridParallelRNG &pRNG) {
+    std::string cfg_file, rng_file;
+    this->build_filenames(traj, Params, cfg_file, rng_file);
+    this->check_filename(rng_file);
+    this->check_filename(cfg_file);
+
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    BinaryIO::readRNG(sRNG, pRNG, rng_file, 0,nersc_csum,scidac_csuma,scidac_csumb);
+
+    Metadata header_md;  // filled by the reader; not used afterwards
+    ScidacReader reader;
+    reader.open(cfg_file);
+    reader.readScidacFieldRecord(U,header_md);  // format from the header
+    reader.close();
+
+    std::cout << GridLogMessage << "Read Scidac Configuration from " << cfg_file
+              << " checksum " << std::hex
+              << nersc_csum<<"/"
+              << scidac_csuma<<"/"
+              << scidac_csumb
+              << std::dec << std::endl;
+  }
+};
+NAMESPACE_END(Grid);
+
+
+#endif  // HAVE_LIME
+
diff --git a/lib/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h
similarity index 95%
rename from lib/qcd/hmc/integrators/Integrator.h
rename to Grid/qcd/hmc/integrators/Integrator.h
index 5d7bcab9..b795d75a 100644
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -114,17 +114,25 @@ protected:
     // Fundamental updates, include smearing
 
     for (int a = 0; a < as[level].actions.size(); ++a) {
+      double start_full = usecond();
       Field force(U.Grid());
       conformable(U.Grid(), Mom.Grid());
+
       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
+      double start_force = usecond();
       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
 
       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
       force = FieldImplementation::projectForce(force); // Ta for gauge fields
+      double end_force = usecond();
       Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites());
-      std::cout << GridLogIntegrator << "Force average: " << force_abs << std::endl;
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
       Mom -= force * ep; 
+      double end_full = usecond();
+      double time_full  = (end_full - start_full) / 1e3;
+      double time_force = (end_force - start_force) / 1e3;
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
     }
 
     // Force from the other representations
diff --git a/lib/qcd/hmc/integrators/Integrator_algorithm.h b/Grid/qcd/hmc/integrators/Integrator_algorithm.h
similarity index 100%
rename from lib/qcd/hmc/integrators/Integrator_algorithm.h
rename to Grid/qcd/hmc/integrators/Integrator_algorithm.h
diff --git a/lib/qcd/modules/ActionModules.h b/Grid/qcd/modules/ActionModules.h
similarity index 100%
rename from lib/qcd/modules/ActionModules.h
rename to Grid/qcd/modules/ActionModules.h
diff --git a/lib/qcd/modules/Factory.h b/Grid/qcd/modules/Factory.h
similarity index 98%
rename from lib/qcd/modules/Factory.h
rename to Grid/qcd/modules/Factory.h
index 66bd627e..fafa2038 100644
--- a/lib/qcd/modules/Factory.h
+++ b/Grid/qcd/modules/Factory.h
@@ -95,7 +95,7 @@ std::unique_ptr<T> Factory<T, CreatorInput>::create(const std::string type,
     }
     catch (std::out_of_range &)
     {
-      //HADRON_ERROR("object of type '" + type + "' unknown");
+      //HADRONS_ERROR("object of type '" + type + "' unknown");
     	std::cout << GridLogError << "Error" << std::endl;
     	std::cout << GridLogError << obj_type() << " object of name [" << type << "] unknown" << std::endl;
     	exit(1);
diff --git a/lib/qcd/modules/FermionOperatorModules.h b/Grid/qcd/modules/FermionOperatorModules.h
similarity index 100%
rename from lib/qcd/modules/FermionOperatorModules.h
rename to Grid/qcd/modules/FermionOperatorModules.h
diff --git a/lib/qcd/modules/Modules.cc b/Grid/qcd/modules/Modules.cc
similarity index 100%
rename from lib/qcd/modules/Modules.cc
rename to Grid/qcd/modules/Modules.cc
diff --git a/lib/qcd/modules/Modules.h b/Grid/qcd/modules/Modules.h
similarity index 100%
rename from lib/qcd/modules/Modules.h
rename to Grid/qcd/modules/Modules.h
diff --git a/lib/qcd/modules/ObservableModules.h b/Grid/qcd/modules/ObservableModules.h
similarity index 91%
rename from lib/qcd/modules/ObservableModules.h
rename to Grid/qcd/modules/ObservableModules.h
index 7ec2bb71..87fcbb92 100644
--- a/lib/qcd/modules/ObservableModules.h
+++ b/Grid/qcd/modules/ObservableModules.h
@@ -90,6 +90,19 @@ public:
   PlaquetteMod(): ObsBase(NoParameters()){}
 };
 
+template < class Impl > // observable module wrapping PolyakovLogger<Impl>; parameterised by NoParameters
+class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{
+  typedef ObservableModule<PolyakovLogger<Impl>, NoParameters> ObsBase;
+  using ObsBase::ObsBase; // inherit the ObsBase constructors
+
+  // acquire resource: allocate the PolyakovLogger held through ObservablePtr
+  virtual void initialize(){
+    this->ObservablePtr.reset(new PolyakovLogger<Impl>());
+  }
+  public:
+  PolyakovMod(): ObsBase(NoParameters()){} // default construction passes an empty NoParameters to the base
+};
+
 
 template < class Impl >
 class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
diff --git a/lib/qcd/modules/Registration.h b/Grid/qcd/modules/Registration.h
similarity index 100%
rename from lib/qcd/modules/Registration.h
rename to Grid/qcd/modules/Registration.h
diff --git a/lib/qcd/modules/SolverModules.h b/Grid/qcd/modules/SolverModules.h
similarity index 100%
rename from lib/qcd/modules/SolverModules.h
rename to Grid/qcd/modules/SolverModules.h
diff --git a/lib/qcd/modules/mods.h b/Grid/qcd/modules/mods.h
similarity index 100%
rename from lib/qcd/modules/mods.h
rename to Grid/qcd/modules/mods.h
diff --git a/lib/qcd/observables/hmc_observable.h b/Grid/qcd/observables/hmc_observable.h
similarity index 88%
rename from lib/qcd/observables/hmc_observable.h
rename to Grid/qcd/observables/hmc_observable.h
index db629ce7..c28c376d 100644
--- a/lib/qcd/observables/hmc_observable.h
+++ b/Grid/qcd/observables/hmc_observable.h
@@ -27,10 +27,9 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef HMC_OBSERVABLE_H
-#define HMC_OBSERVABLE_H
+#pragma once
 
-namespace Grid{
+NAMESPACE_BEGIN(Grid);
 
 template <class Field>
 class HmcObservable {
@@ -41,9 +40,14 @@ class HmcObservable {
                                   GridParallelRNG &pRNG) = 0;
 };
 
-}  // namespace Grid
+NAMESPACE_END(Grid);
 
+NAMESPACE_CHECK(HmcObs);
 #include "plaquette.h"
+NAMESPACE_CHECK(Plaq);
 #include "topological_charge.h"
+NAMESPACE_CHECK(Topo);
+#include "polyakov_loop.h"
+NAMESPACE_CHECK(Polyakov);
+
 
-#endif  //  HMC_OBSERVABLE_H
diff --git a/lib/qcd/observables/plaquette.h b/Grid/qcd/observables/plaquette.h
similarity index 96%
rename from lib/qcd/observables/plaquette.h
rename to Grid/qcd/observables/plaquette.h
index e8d30d12..f038e455 100644
--- a/lib/qcd/observables/plaquette.h
+++ b/Grid/qcd/observables/plaquette.h
@@ -27,8 +27,7 @@ directory
 *************************************************************************************/
 			   /*  END LEGAL */
 
-#ifndef HMC_PLAQUETTE_H
-#define HMC_PLAQUETTE_H
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -63,4 +62,4 @@ public:
 
 NAMESPACE_END(Grid);
 
-#endif  // HMC_PLAQUETTE_H
+
diff --git a/Grid/qcd/observables/polyakov_loop.h b/Grid/qcd/observables/polyakov_loop.h
new file mode 100644
index 00000000..0b59f549
--- /dev/null
+++ b/Grid/qcd/observables/polyakov_loop.h
@@ -0,0 +1,63 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/observables/polyakov_loop.h
+
+Copyright (C) 2017
+
+Author: David Preti <david.preti@csic.es>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// this is only defined for a gauge theory
+template <class Impl>
+class PolyakovLogger : public HmcObservable<typename Impl::Field> {
+ public:
+  // HMC observable that prints the average Polyakov loop when a trajectory completes.
+  // INHERIT_GIMPL_TYPES forces Impl to be a gauge-field implementation; otherwise the compiler will complain.
+  INHERIT_GIMPL_TYPES(Impl);
+
+  // necessary for HmcObservable compatibility
+  typedef typename Impl::Field Field;
+  // Logs WilsonLoops<Impl>::avgPolyakovLoop(U) tagged with traj; sRNG and pRNG are not used.
+  void TrajectoryComplete(int traj,
+                          Field &U,
+                          GridSerialRNG &sRNG,
+                          GridParallelRNG &pRNG) {
+
+    ComplexD polyakov = WilsonLoops<Impl>::avgPolyakovLoop(U);
+    // precision() returns std::streamsize: keep that type so the saved value is restored without narrowing
+    const std::streamsize def_prec = std::cout.precision();
+
+    std::cout << GridLogMessage
+        << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+        << "Polyakov Loop: [ " << traj << " ] "<< polyakov << std::endl;
+
+    std::cout.precision(def_prec);
+
+  }
+};
+
+NAMESPACE_END(Grid);
diff --git a/lib/qcd/observables/topological_charge.h b/Grid/qcd/observables/topological_charge.h
similarity index 97%
rename from lib/qcd/observables/topological_charge.h
rename to Grid/qcd/observables/topological_charge.h
index 115b8ac0..4f116496 100644
--- a/lib/qcd/observables/topological_charge.h
+++ b/Grid/qcd/observables/topological_charge.h
@@ -27,8 +27,7 @@ directory
 *************************************************************************************/
 			   /*  END LEGAL */
 
-#ifndef HMC_TOP_CHARGE_H
-#define HMC_TOP_CHARGE_H
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -118,4 +117,4 @@ public:
 
 NAMESPACE_END(Grid);
 
-#endif  //  HMC_TOP_CHARGE_H
+
diff --git a/lib/qcd/representations/Representations.h b/Grid/qcd/representations/Representations.h
similarity index 100%
rename from lib/qcd/representations/Representations.h
rename to Grid/qcd/representations/Representations.h
diff --git a/lib/qcd/representations/adjoint.h b/Grid/qcd/representations/adjoint.h
similarity index 98%
rename from lib/qcd/representations/adjoint.h
rename to Grid/qcd/representations/adjoint.h
index 1e57b0d2..ee54b465 100644
--- a/lib/qcd/representations/adjoint.h
+++ b/Grid/qcd/representations/adjoint.h
@@ -22,6 +22,7 @@ public:
   typedef typename SU_Adjoint<ncolour>::LatticeAdjMatrix LatticeMatrix;
   typedef typename SU_Adjoint<ncolour>::LatticeAdjField LatticeField;
   static const int Dimension = ncolour * ncolour - 1;
+  static const bool isFundamental = false;
 
   LatticeField U;
 
diff --git a/lib/qcd/representations/fundamental.h b/Grid/qcd/representations/fundamental.h
similarity index 96%
rename from lib/qcd/representations/fundamental.h
rename to Grid/qcd/representations/fundamental.h
index 44d8e7af..4d4779e1 100644
--- a/lib/qcd/representations/fundamental.h
+++ b/Grid/qcd/representations/fundamental.h
@@ -17,6 +17,7 @@ template <int ncolour>
 class FundamentalRep {
 public:
   static const int Dimension = ncolour;
+  static const bool isFundamental = true;
 
   // typdef to be used by the Representations class in HMC to get the
   // types for the higher representation fields
diff --git a/lib/qcd/representations/hmc_types.h b/Grid/qcd/representations/hmc_types.h
similarity index 100%
rename from lib/qcd/representations/hmc_types.h
rename to Grid/qcd/representations/hmc_types.h
diff --git a/lib/qcd/representations/two_index.h b/Grid/qcd/representations/two_index.h
similarity index 98%
rename from lib/qcd/representations/two_index.h
rename to Grid/qcd/representations/two_index.h
index b292b9a6..8dfb5561 100644
--- a/lib/qcd/representations/two_index.h
+++ b/Grid/qcd/representations/two_index.h
@@ -28,6 +28,7 @@ public:
   typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexMatrix LatticeMatrix;
   typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexField LatticeField;
   static const int Dimension = ncolour * (ncolour + S) / 2;
+  static const bool isFundamental = false;
 
   LatticeField U;
 
diff --git a/lib/qcd/smearing/APEsmearing.h b/Grid/qcd/smearing/APEsmearing.h
similarity index 99%
rename from lib/qcd/smearing/APEsmearing.h
rename to Grid/qcd/smearing/APEsmearing.h
index 146a053e..bec97ee4 100644
--- a/lib/qcd/smearing/APEsmearing.h
+++ b/Grid/qcd/smearing/APEsmearing.h
@@ -30,8 +30,7 @@ directory
 			     @brief Declaration of Smear_APE class for APE smearing
 			   */
 
-#ifndef APE_SMEAR_
-#define APE_SMEAR_
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -151,4 +150,3 @@ public:
 
 NAMESPACE_END(Grid);
 
-#endif
diff --git a/lib/qcd/smearing/BaseSmearing.h b/Grid/qcd/smearing/BaseSmearing.h
similarity index 96%
rename from lib/qcd/smearing/BaseSmearing.h
rename to Grid/qcd/smearing/BaseSmearing.h
index f356cfd5..163da275 100644
--- a/lib/qcd/smearing/BaseSmearing.h
+++ b/Grid/qcd/smearing/BaseSmearing.h
@@ -29,9 +29,8 @@ directory
 /*
   @brief Declares base smearing class Smear
  */
-#ifndef BASE_SMEAR_
-#define BASE_SMEAR_
-
+#pragma once
+NAMESPACE_BEGIN(Grid);
 template <class Gimpl>
 class Smear{
 public:
@@ -41,4 +40,5 @@ public:
   virtual void smear     (GaugeField&,const GaugeField&)const = 0;
   virtual void derivative(GaugeField&, const GaugeField&,const GaugeField&) const = 0;
 };
-#endif
+NAMESPACE_END(Grid);
+
diff --git a/lib/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h
similarity index 86%
rename from lib/qcd/smearing/GaugeConfiguration.h
rename to Grid/qcd/smearing/GaugeConfiguration.h
index d3cede50..f4d00c72 100644
--- a/lib/qcd/smearing/GaugeConfiguration.h
+++ b/Grid/qcd/smearing/GaugeConfiguration.h
@@ -3,14 +3,14 @@
 
   @brief Declares the GaugeConfiguration class
 */
-#ifndef GAUGE_CONFIG_
-#define GAUGE_CONFIG_
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
 //trivial class for no smearing
 template< class Impl >
-class NoSmearing {
+class NoSmearing
+{
 public:
   INHERIT_FIELD_TYPES(Impl);
 
@@ -24,10 +24,10 @@ public:
 
   Field& get_SmearedU() { return *ThinField; }
 
-  Field& get_U(bool smeared = false) {
+  Field &get_U(bool smeared = false)
+  {
     return *ThinField;
   }
-
 };
 
 /*!
@@ -42,7 +42,8 @@ public:
   It stores a list of smeared configurations.
 */
 template <class Gimpl>
-class SmearedConfiguration {
+class SmearedConfiguration
+{
 public:
   INHERIT_GIMPL_TYPES(Gimpl);
 
@@ -53,7 +54,8 @@ private:
 
   // Member functions
   //====================================================================
-  void fill_smearedSet(GaugeField& U) {
+  void fill_smearedSet(GaugeField &U)
+  {
     ThinLinks = &U;  // attach the smearing routine to the field U
 
     // check the pointer is not null
@@ -61,13 +63,15 @@ private:
       std::cout << GridLogError
                 << "[SmearedConfiguration] Error in ThinLinks pointer\n";
 
-    if (smearingLevels > 0) {
+    if (smearingLevels > 0)
+    {
       std::cout << GridLogDebug
                 << "[SmearedConfiguration] Filling SmearedSet\n";
       GaugeField previous_u(ThinLinks->Grid());
 
       previous_u = *ThinLinks;
-      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) {
+      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
+      {
         StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
         previous_u = SmearedSet[smearLvl];
 
@@ -80,7 +84,8 @@ private:
   }
   //====================================================================
   GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
-                                  const GaugeField& GaugeK) const {
+                                  const GaugeField& GaugeK) const 
+  {
     GridBase* grid = GaugeK.Grid();
     GaugeField C(grid), SigmaK(grid), iLambda(grid);
     GaugeLinkField iLambda_mu(grid);
@@ -92,7 +97,8 @@ private:
     SigmaK = Zero();
     iLambda = Zero();
 
-    for (int mu = 0; mu < Nd; mu++) {
+    for (int mu = 0; mu < Nd; mu++)
+    {
       Cmu = peekLorentz(C, mu);
       GaugeKmu = peekLorentz(GaugeK, mu);
       SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu);
@@ -107,14 +113,16 @@ private:
   }
 
   /*! @brief Returns smeared configuration at level 'Level' */
-  const GaugeField& get_smeared_conf(int Level) const {
+  const GaugeField &get_smeared_conf(int Level) const
+  {
     return SmearedSet[Level];
   }
 
   //====================================================================
   void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ,
                    const GaugeLinkField& iQ, const GaugeLinkField& Sigmap,
-                   const GaugeLinkField& GaugeK) const {
+                   const GaugeLinkField& GaugeK) const 
+  {
     GridBase* grid = iQ.Grid();
     GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid);
     GaugeLinkField unity(grid);
@@ -206,13 +214,13 @@ private:
   //====================================================================
 public:
   GaugeField*
-  ThinLinks; /*!< @brief Pointer to the thin
-	       links configuration */
+      ThinLinks; /* Pointer to the thin links configuration */
 
-  /*! @brief Standard constructor */
+  /* Standard constructor */
   SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
                        Smear_Stout<Gimpl>& Stout)
-    : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) {
+      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL)
+  {
     for (unsigned int i = 0; i < smearingLevels; ++i)
       SmearedSet.push_back(*(new GaugeField(UGrid)));
   }
@@ -221,21 +229,29 @@ public:
   SmearedConfiguration()
     : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}
 
-
-  
   // attach the smeared routines to the thin links U and fill the smeared set
-  void set_Field(GaugeField& U) { fill_smearedSet(U); }
+  void set_Field(GaugeField &U)
+  {
+    // Run fill_smearedSet(U) and log the elapsed usecond() interval, divided by 1e3 and labelled ms.
+    const double t_begin = usecond();
+    fill_smearedSet(U);
+    const double elapsed_ms = (usecond() - t_begin) / 1e3;
+    std::cout << GridLogMessage << "Smearing in " << elapsed_ms << " ms" << std::endl;
+  }
 
   //====================================================================
-  void smeared_force(GaugeField& SigmaTilde) const {
-    if (smearingLevels > 0) {
+  void smeared_force(GaugeField &SigmaTilde) const
+  {
+    if (smearingLevels > 0)
+    {
+      double start = usecond();
       GaugeField force = SigmaTilde; // actually = U*SigmaTilde
       GaugeLinkField tmp_mu(SigmaTilde.Grid());
 
-      for (int mu = 0; mu < Nd; mu++) {
+      for (int mu = 0; mu < Nd; mu++)
+      {
         // to get just SigmaTilde
-        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) *
-	  peekLorentz(force, mu);
+        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * peekLorentz(force, mu);
         pokeLorentz(force, tmp_mu, mu);
       }
 
@@ -244,33 +260,43 @@ public:
 
       force = AnalyticSmearedForce(force, *ThinLinks);
 
-      for (int mu = 0; mu < Nd; mu++) {
+      for (int mu = 0; mu < Nd; mu++)
+      {
         tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu);
         pokeLorentz(SigmaTilde, tmp_mu, mu);
       }
+      double end = usecond();
+      double time = (end - start)/ 1e3;
+      std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;  
     }  // if smearingLevels = 0 do nothing
   }
   //====================================================================
 
   GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
 
-  GaugeField& get_U(bool smeared = false) {
+  GaugeField &get_U(bool smeared = false)
+  {
     // get the config, thin links by default
-    if (smeared) {
-      if (smearingLevels) {
+    if (smeared)
+    {
+      if (smearingLevels)
+      {
         RealD impl_plaq =
 	  WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]);
         std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq
                   << std::endl;
         return get_SmearedU();
-
-      } else {
+      }
+      else
+      {
         RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
         std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                   << std::endl;
         return *ThinLinks;
       }
-    } else {
+    }
+    else
+    {
       RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
       std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                 << std::endl;
@@ -281,4 +307,3 @@ public:
 
 NAMESPACE_END(Grid);
 
-#endif
diff --git a/lib/qcd/smearing/Smearing.h b/Grid/qcd/smearing/Smearing.h
similarity index 78%
rename from lib/qcd/smearing/Smearing.h
rename to Grid/qcd/smearing/Smearing.h
index 932acf86..da5ede72 100644
--- a/lib/qcd/smearing/Smearing.h
+++ b/Grid/qcd/smearing/Smearing.h
@@ -1,5 +1,4 @@
-#ifndef GRID_QCD_SMEARING_H
-#define GRID_QCD_SMEARING_H
+#pragma once
 
 #include <Grid/qcd/smearing/BaseSmearing.h>
 #include <Grid/qcd/smearing/APEsmearing.h>
@@ -7,4 +6,3 @@
 #include <Grid/qcd/smearing/GaugeConfiguration.h>
 #include <Grid/qcd/smearing/WilsonFlow.h>
 
-#endif
diff --git a/lib/qcd/smearing/StoutSmearing.h b/Grid/qcd/smearing/StoutSmearing.h
similarity index 98%
rename from lib/qcd/smearing/StoutSmearing.h
rename to Grid/qcd/smearing/StoutSmearing.h
index 9dddad54..f463dc81 100644
--- a/lib/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -2,8 +2,7 @@
   @file stoutSmear.hpp
   @brief Declares Stout smearing class
 */
-#ifndef STOUT_SMEAR_
-#define STOUT_SMEAR_
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -156,4 +155,3 @@ public:
 
 NAMESPACE_END(Grid);
 
-#endif
diff --git a/lib/qcd/smearing/WilsonFlow.h b/Grid/qcd/smearing/WilsonFlow.h
similarity index 98%
rename from lib/qcd/smearing/WilsonFlow.h
rename to Grid/qcd/smearing/WilsonFlow.h
index e1bb73cf..19fd94e2 100644
--- a/lib/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@@ -27,8 +27,7 @@ directory
 *************************************************************************************/
 			   /*  END LEGAL */
 
-#ifndef WILSONFLOW_H
-#define WILSONFLOW_H
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -172,7 +171,7 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
     std::cout << "Time to evolve " << diff.count() << " s\n";
 #endif
     std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-	      << step << "  "
+		  << step << "  " << tau(step) << "  " 
 	      << energyDensityPlaquette(step,out) << std::endl;
     if( step % measure_interval == 0){
       std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
@@ -192,7 +191,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
     //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
     evolve_step_adaptive(out, maxTau);
     std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-	      << step << "  "
+		  << step << "  " << taus << "  "
 	      << energyDensityPlaquette(out) << std::endl;
     if( step % measure_interval == 0){
       std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
@@ -205,7 +204,5 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
 
 }
 
-
 NAMESPACE_END(Grid);
 
-#endif   // WILSONFLOW_H
diff --git a/lib/qcd/spin/Dirac.h b/Grid/qcd/spin/Dirac.h
similarity index 100%
rename from lib/qcd/spin/Dirac.h
rename to Grid/qcd/spin/Dirac.h
diff --git a/lib/qcd/spin/Gamma.cc b/Grid/qcd/spin/Gamma.cc
similarity index 100%
rename from lib/qcd/spin/Gamma.cc
rename to Grid/qcd/spin/Gamma.cc
diff --git a/lib/qcd/spin/Gamma.h b/Grid/qcd/spin/Gamma.h
similarity index 100%
rename from lib/qcd/spin/Gamma.h
rename to Grid/qcd/spin/Gamma.h
diff --git a/lib/qcd/spin/Spin.h b/Grid/qcd/spin/Spin.h
similarity index 100%
rename from lib/qcd/spin/Spin.h
rename to Grid/qcd/spin/Spin.h
diff --git a/lib/qcd/spin/TwoSpinor.h b/Grid/qcd/spin/TwoSpinor.h
similarity index 100%
rename from lib/qcd/spin/TwoSpinor.h
rename to Grid/qcd/spin/TwoSpinor.h
diff --git a/lib/qcd/spin/gamma-gen/gamma-gen.nb b/Grid/qcd/spin/gamma-gen/gamma-gen.nb
similarity index 70%
rename from lib/qcd/spin/gamma-gen/gamma-gen.nb
rename to Grid/qcd/spin/gamma-gen/gamma-gen.nb
index b9753df7..4167b6e2 100644
--- a/lib/qcd/spin/gamma-gen/gamma-gen.nb
+++ b/Grid/qcd/spin/gamma-gen/gamma-gen.nb
@@ -10,33 +10,34 @@
 NotebookFileLineBreakTest
 NotebookFileLineBreakTest
 NotebookDataPosition[       158,          7]
-NotebookDataLength[     56640,       1480]
-NotebookOptionsPosition[     55061,       1426]
-NotebookOutlinePosition[     55421,       1442]
-CellTagsIndexPosition[     55378,       1439]
+NotebookDataLength[     75090,       1956]
+NotebookOptionsPosition[     69536,       1867]
+NotebookOutlinePosition[     69898,       1883]
+CellTagsIndexPosition[     69855,       1880]
 WindowFrame->Normal*)
 
 (* Beginning of Notebook Content *)
 Notebook[{
 Cell[TextData[StyleBox["Grid physics library, www.github.com/paboyle/Grid \n\n\
-Source file: lib/qcd/spin/gamma-gen/gamma-gen.nb\n\nCopyright (C) 2015\n\
-Copyright (C) 2016\nCopyright (C) 2017\n\nAuthor: Antonin Portelli \
-<antonin.portelli@me.com>\n\nThis program is free software; you can \
-redistribute it and/or modify\nit under the terms of the GNU General Public \
-License as published by\nthe Free Software Foundation; either version 2 of \
-the License, or\n(at your option) any later version.\n\nThis program is \
-distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; \
-without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A \
-PARTICULAR PURPOSE.  See the\nGNU General Public License for more details.\n\n\
-You should have received a copy of the GNU General Public License along\nwith \
-this program; if not, write to the Free Software Foundation, Inc.,\n51 \
-Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.\n\nSee the full \
-license in the file \[OpenCurlyDoubleQuote]LICENSE\[CloseCurlyDoubleQuote] in \
-the top level distribution directory",
+Source file: lib/qcd/spin/gamma-gen/gamma-gen.nb\n\nCopyright (C) 2015-2018\n\
+\nAuthor: Antonin Portelli <antonin.portelli@me.com>\n\nThis program is free \
+software; you can redistribute it and/or modify\nit under the terms of the \
+GNU General Public License as published by\nthe Free Software Foundation; \
+either version 2 of the License, or\n(at your option) any later version.\n\n\
+This program is distributed in the hope that it will be useful,\nbut WITHOUT \
+ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or \
+FITNESS FOR A PARTICULAR PURPOSE.  See the\nGNU General Public License for \
+more details.\n\nYou should have received a copy of the GNU General Public \
+License along\nwith this program; if not, write to the Free Software \
+Foundation, Inc.,\n51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 \
+USA.\n\nSee the full license in the file \[OpenCurlyDoubleQuote]LICENSE\
+\[CloseCurlyDoubleQuote] in the top level distribution directory",
  Background->RGBColor[1, 1, 0.85]]], "Text",
  CellChangeTimes->{{3.694192702097444*^9, 3.694192776106186*^9}, {
-  3.6942089129622507`*^9, 3.694208916624515*^9}},
- Background->RGBColor[1, 1, 0.85]],
+  3.6942089129622507`*^9, 3.694208916624515*^9}, {3.743227925290299*^9, 
+  3.743227927744316*^9}},
+ Background->RGBColor[
+  1, 1, 0.85],ExpressionUUID->"f0373ef0-8c33-4e9a-9f09-9bf718da72ef"],
 
 Cell[CellGroupData[{
 
@@ -50,7 +51,8 @@ Cell[BoxData[{
   RowBox[{"NotebookDirectory", "[", "]"}], "]"}]}], "Input",
  CellChangeTimes->{{3.69418610909842*^9, 3.694186122331771*^9}, {
   3.694189694542165*^9, 3.6941897146300087`*^9}, {3.694297413625847*^9, 
-  3.694297419003489*^9}}],
+  3.694297419003489*^9}},ExpressionUUID->"5c937a3e-adfd-4d7e-8fde-\
+afb3337c72d9"],
 
 Cell[BoxData["\<\"/Users/antonin/Development/Grid/lib/qcd/spin/gamma-gen\"\>"]\
 , "Output",
@@ -67,13 +69,245 @@ Cell[BoxData["\<\"/Users/antonin/Development/Grid/lib/qcd/spin/gamma-gen\"\>"]\
    3.6942984168973837`*^9, 3.6948949168128557`*^9, 3.69489495942482*^9, 
    3.6949627104409447`*^9, {3.694962842273374*^9, 3.6949628507953672`*^9}, 
    3.694963445768766*^9, 3.6949643976358423`*^9, {3.694964715764683*^9, 
-   3.6949647261937733`*^9}, 3.6949650592533703`*^9, 3.694965165070952*^9}]
+   3.6949647261937733`*^9}, 3.6949650592533703`*^9, 3.694965165070952*^9, 
+   3.74322794542139*^9},ExpressionUUID->"72817ba6-2f6a-4a4d-8212-\
+6f0970f49e7c"]
+}, Open  ]],
+
+Cell[CellGroupData[{
+
+Cell[BoxData[
+ RowBox[{"FactorInteger", "[", "3152", "]"}]], "Input",
+ CellChangeTimes->{{3.7432347536316767`*^9, 3.7432347764739027`*^9}, {
+  3.743234833567358*^9, 
+  3.743234862146022*^9}},ExpressionUUID->"d1a0fd03-85e1-43af-ba80-\
+3ca4235675d8"],
+
+Cell[BoxData[
+ RowBox[{"{", 
+  RowBox[{
+   RowBox[{"{", 
+    RowBox[{"2", ",", "4"}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{"197", ",", "1"}], "}"}]}], "}"}]], "Output",
+ CellChangeTimes->{{3.743234836792224*^9, 
+  3.743234862493619*^9}},ExpressionUUID->"16d3f953-4b24-4ed2-ae62-\
+306dcab66ca7"]
+}, Open  ]],
+
+Cell[CellGroupData[{
+
+Cell[BoxData[
+ RowBox[{"sol", "=", 
+  RowBox[{"Solve", "[", 
+   RowBox[{
+    RowBox[{
+     RowBox[{
+      SuperscriptBox["x", "2"], "+", 
+      SuperscriptBox["y", "2"], "+", 
+      SuperscriptBox["z", "2"]}], "\[Equal]", "2"}], ",", 
+    RowBox[{"{", 
+     RowBox[{"x", ",", "y", ",", "z"}], "}"}], ",", "Integers"}], 
+   "]"}]}]], "Input",
+ CellChangeTimes->{{3.743235304127721*^9, 
+  3.7432353087929983`*^9}},ExpressionUUID->"f0fa2a5c-3d81-4d75-a447-\
+50c7ca3459ff"],
+
+Cell[BoxData[
+ RowBox[{"{", 
+  RowBox[{
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", 
+      RowBox[{"-", "1"}]}], ",", 
+     RowBox[{"y", "\[Rule]", 
+      RowBox[{"-", "1"}]}], ",", 
+     RowBox[{"z", "\[Rule]", "0"}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", 
+      RowBox[{"-", "1"}]}], ",", 
+     RowBox[{"y", "\[Rule]", "0"}], ",", 
+     RowBox[{"z", "\[Rule]", 
+      RowBox[{"-", "1"}]}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", 
+      RowBox[{"-", "1"}]}], ",", 
+     RowBox[{"y", "\[Rule]", "0"}], ",", 
+     RowBox[{"z", "\[Rule]", "1"}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", 
+      RowBox[{"-", "1"}]}], ",", 
+     RowBox[{"y", "\[Rule]", "1"}], ",", 
+     RowBox[{"z", "\[Rule]", "0"}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", "0"}], ",", 
+     RowBox[{"y", "\[Rule]", 
+      RowBox[{"-", "1"}]}], ",", 
+     RowBox[{"z", "\[Rule]", 
+      RowBox[{"-", "1"}]}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", "0"}], ",", 
+     RowBox[{"y", "\[Rule]", 
+      RowBox[{"-", "1"}]}], ",", 
+     RowBox[{"z", "\[Rule]", "1"}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", "0"}], ",", 
+     RowBox[{"y", "\[Rule]", "1"}], ",", 
+     RowBox[{"z", "\[Rule]", 
+      RowBox[{"-", "1"}]}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", "0"}], ",", 
+     RowBox[{"y", "\[Rule]", "1"}], ",", 
+     RowBox[{"z", "\[Rule]", "1"}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", "1"}], ",", 
+     RowBox[{"y", "\[Rule]", 
+      RowBox[{"-", "1"}]}], ",", 
+     RowBox[{"z", "\[Rule]", "0"}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", "1"}], ",", 
+     RowBox[{"y", "\[Rule]", "0"}], ",", 
+     RowBox[{"z", "\[Rule]", 
+      RowBox[{"-", "1"}]}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", "1"}], ",", 
+     RowBox[{"y", "\[Rule]", "0"}], ",", 
+     RowBox[{"z", "\[Rule]", "1"}]}], "}"}], ",", 
+   RowBox[{"{", 
+    RowBox[{
+     RowBox[{"x", "\[Rule]", "1"}], ",", 
+     RowBox[{"y", "\[Rule]", "1"}], ",", 
+     RowBox[{"z", "\[Rule]", "0"}]}], "}"}]}], "}"}]], "Output",
+ CellChangeTimes->{{3.743235305220907*^9, 
+  3.743235309139554*^9}},ExpressionUUID->"d9825c95-24bb-442a-8734-\
+4c0f47e99dfc"]
+}, Open  ]],
+
+Cell[BoxData[
+ RowBox[{
+  RowBox[{"xmlElem", "[", "x_", "]"}], ":=", 
+  RowBox[{"Print", "[", 
+   RowBox[{"\"\<<elem>\>\"", "<>", 
+    RowBox[{"ToString", "[", 
+     RowBox[{"x", "[", 
+      RowBox[{"[", "1", "]"}], "]"}], "]"}], "<>", "\"\< \>\"", "<>", 
+    RowBox[{"ToString", "[", 
+     RowBox[{"x", "[", 
+      RowBox[{"[", "2", "]"}], "]"}], "]"}], "<>", "\"\< \>\"", "<>", 
+    RowBox[{"ToString", "[", 
+     RowBox[{"x", "[", 
+      RowBox[{"[", "3", "]"}], "]"}], "]"}], "<>", "\"\<</elem>\>\""}], 
+   "]"}]}]], "Input",
+ CellChangeTimes->{{3.74323534002862*^9, 3.743235351000985*^9}, {
+  3.743235403233039*^9, 3.743235413488028*^9}, {3.743235473169856*^9, 
+  3.7432354747126904`*^9}},ExpressionUUID->"aea76313-c89e-45e8-b429-\
+3f454091666d"],
+
+Cell[CellGroupData[{
+
+Cell[BoxData[
+ RowBox[{
+  RowBox[{
+   RowBox[{"xmlElem", "[", 
+    RowBox[{
+     RowBox[{"{", 
+      RowBox[{"x", ",", "y", ",", "z"}], "}"}], "/.", "#"}], "]"}], "&"}], "/@",
+   "sol"}]], "Input",
+ CellChangeTimes->{{3.743235415820318*^9, 
+  3.743235467025091*^9}},ExpressionUUID->"07da3998-8eab-40ba-8c0b-\
+ac6b130cb4fb"],
+
+Cell[CellGroupData[{
+
+Cell[BoxData["\<\"<elem>-1 -1 0</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476581676*^9},ExpressionUUID->"c577ba06-b67a-405a-9ff5-\
+2bf7dc898d03"],
+
+Cell[BoxData["\<\"<elem>-1 0 -1</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476588011*^9},ExpressionUUID->"d041aa36-0cea-457c-9d4b-\
+1fe9be66e2ab"],
+
+Cell[BoxData["\<\"<elem>-1 0 1</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476596887*^9},ExpressionUUID->"bf141b55-86b2-4430-a994-\
+5c03d5a19441"],
+
+Cell[BoxData["\<\"<elem>-1 1 0</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476605785*^9},ExpressionUUID->"4968a660-4ecf-4b66-9071-\
+8bd798c18d21"],
+
+Cell[BoxData["\<\"<elem>0 -1 -1</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476613523*^9},ExpressionUUID->"4e22d943-2680-416b-a1d7-\
+a16ca20b781f"],
+
+Cell[BoxData["\<\"<elem>0 -1 1</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.7432354766218576`*^9},ExpressionUUID->"6dd38385-08b3-4dd9-932f-\
+98a00c6db1b2"],
+
+Cell[BoxData["\<\"<elem>0 1 -1</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476629427*^9},ExpressionUUID->"ef3baad3-91d1-4735-9a22-\
+53495a624c15"],
+
+Cell[BoxData["\<\"<elem>0 1 1</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476638257*^9},ExpressionUUID->"413fbb68-5017-4272-a62a-\
+fa234e6daaea"],
+
+Cell[BoxData["\<\"<elem>1 -1 0</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476646203*^9},ExpressionUUID->"3a832a60-ae00-414b-a9ac-\
+f5e86e67e917"],
+
+Cell[BoxData["\<\"<elem>1 0 -1</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476653907*^9},ExpressionUUID->"bfc79ef6-f6c7-4f1e-88e8-\
+005ac314be9c"],
+
+Cell[BoxData["\<\"<elem>1 0 1</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.743235476662575*^9},ExpressionUUID->"0f892891-f885-489c-9925-\
+ddef4d698410"],
+
+Cell[BoxData["\<\"<elem>1 1 0</elem>\"\>"], "Print",
+ CellChangeTimes->{
+  3.7432354766702337`*^9},ExpressionUUID->"2906f190-e673-4f33-9c34-\
+e8e56efe7a27"]
+}, Open  ]],
+
+Cell[BoxData[
+ RowBox[{"{", 
+  RowBox[{
+  "Null", ",", "Null", ",", "Null", ",", "Null", ",", "Null", ",", "Null", 
+   ",", "Null", ",", "Null", ",", "Null", ",", "Null", ",", "Null", ",", 
+   "Null"}], "}"}]], "Output",
+ CellChangeTimes->{
+  3.7432354246225967`*^9, {3.7432354674878073`*^9, 
+   3.743235476678007*^9}},ExpressionUUID->"500ca3c1-88d8-46e5-a1a1-\
+86a7878e5638"]
 }, Open  ]],
 
 Cell[CellGroupData[{
 
 Cell["Clifford algebra generation", "Section",
- CellChangeTimes->{{3.6942089434583883`*^9, 3.694208978559093*^9}}],
+ CellChangeTimes->{{3.6942089434583883`*^9, 
+  3.694208978559093*^9}},ExpressionUUID->"a5b064b3-3011-4922-8559-\
+ead857cad102"],
 
 Cell[BoxData[{
  RowBox[{
@@ -89,11 +323,15 @@ Cell[BoxData[{
    RowBox[{"a", ".", "b"}], "+", 
    RowBox[{"b", ".", "a"}]}]}]}], "Input",
  CellChangeTimes->{{3.694184330267939*^9, 3.694184337479828*^9}, {
-  3.694184821238667*^9, 3.6941848260602217`*^9}}],
+  3.694184821238667*^9, 
+  3.6941848260602217`*^9}},ExpressionUUID->"aa28f02b-31e1-4df2-9b5d-\
+482177464b59"],
 
 Cell["Definition of the matrix representation of the algebra:", "Text",
  CellChangeTimes->{{3.6942090405172586`*^9, 3.694209073962101*^9}, {
-  3.6942974330697393`*^9, 3.694297433821431*^9}}],
+  3.6942974330697393`*^9, 
+  3.694297433821431*^9}},ExpressionUUID->"c8896b88-f1db-4ce4-b7a6-\
+0c9838bdb8f1"],
 
 Cell[BoxData[{
  RowBox[{
@@ -263,10 +501,190 @@ Cell[BoxData[{
    3.694185044355978*^9, 3.694185099415689*^9}, {3.694185502749824*^9, 
    3.694185675128971*^9}, {3.694185728773429*^9, 3.694185729056695*^9}, 
    3.694185780274218*^9, 3.6941858224264593`*^9, {3.694185941282981*^9, 
-   3.694185950262871*^9}}],
+   3.694185950262871*^9}},ExpressionUUID->"52a96ff6-047e-4043-86d0-\
+e303866e5f8e"],
+
+Cell[CellGroupData[{
+
+Cell[BoxData[
+ RowBox[{"Simplify", "[", 
+  RowBox[{
+   RowBox[{
+    RowBox[{"Conjugate", "[", 
+     RowBox[{"{", 
+      RowBox[{"w0", ",", "w1", ",", "w2", ",", "w3"}], "}"}], "]"}], ".", 
+    RowBox[{"(", 
+     RowBox[{
+      RowBox[{"(", 
+       RowBox[{
+        RowBox[{"a0", " ", 
+         RowBox[{"mat", "[", "gx", "]"}]}], "+", 
+        RowBox[{"a1", " ", 
+         RowBox[{"mat", "[", "gy", "]"}]}], "+", 
+        RowBox[{"a2", " ", 
+         RowBox[{"mat", "[", "gz", "]"}]}], "+", 
+        RowBox[{"a3", " ", 
+         RowBox[{"mat", "[", "gt", "]"}]}]}], ")"}], "/.", 
+      RowBox[{"{", 
+       RowBox[{
+        RowBox[{"a0", "\[Rule]", 
+         RowBox[{
+          FractionBox["1", 
+           RowBox[{"2", "\[ImaginaryI]"}]], 
+          RowBox[{"(", 
+           RowBox[{"b0", "-", 
+            RowBox[{"Conjugate", "[", "b0", "]"}]}], ")"}]}]}], ",", 
+        RowBox[{"a1", "\[Rule]", 
+         RowBox[{
+          FractionBox["1", "2"], 
+          RowBox[{"(", 
+           RowBox[{"b0", "+", 
+            RowBox[{"Conjugate", "[", "b0", "]"}]}], ")"}]}]}], ",", 
+        RowBox[{"a2", "\[Rule]", 
+         RowBox[{
+          FractionBox["1", 
+           RowBox[{"2", "\[ImaginaryI]"}]], 
+          RowBox[{"(", 
+           RowBox[{"b1", "-", 
+            RowBox[{"Conjugate", "[", "b1", "]"}]}], ")"}]}]}], ",", 
+        RowBox[{"a3", "\[Rule]", 
+         RowBox[{
+          FractionBox["1", "2"], 
+          RowBox[{"(", 
+           RowBox[{"b1", "+", 
+            RowBox[{"Conjugate", "[", "b1", "]"}]}], ")"}]}]}]}], "}"}]}], 
+     ")"}], ".", 
+    RowBox[{"{", 
+     RowBox[{"v0", ",", "v1", ",", "v2", ",", "v3"}], "}"}]}], "//", 
+   "MatrixForm"}], "]"}]], "Input",
+ CellChangeTimes->{{3.7432279607971277`*^9, 3.743228006752531*^9}, {
+   3.7432280803839817`*^9, 3.743228145366938*^9}, 3.743228258639215*^9, {
+   3.743228553975891*^9, 3.743228599277994*^9}, {3.743228634543832*^9, 
+   3.743228743174139*^9}, {3.743228787080687*^9, 3.7432287981829977`*^9}, {
+   3.743228839944178*^9, 3.743228849469906*^9}, {3.743340578292872*^9, 
+   3.743340582859209*^9}, {3.743342105414857*^9, 
+   3.7433421141838713`*^9}},ExpressionUUID->"8b0f4955-2c3f-418c-9226-\
+9be8f87621e8"],
+
+Cell[BoxData[
+ TagBox[
+  RowBox[{
+   RowBox[{"b1", " ", "v2", " ", 
+    RowBox[{"Conjugate", "[", "w0", "]"}]}], "-", 
+   RowBox[{"v3", " ", 
+    RowBox[{"Conjugate", "[", "b0", "]"}], " ", 
+    RowBox[{"Conjugate", "[", "w0", "]"}]}], "+", 
+   RowBox[{"b0", " ", "v2", " ", 
+    RowBox[{"Conjugate", "[", "w1", "]"}]}], "+", 
+   RowBox[{"v3", " ", 
+    RowBox[{"Conjugate", "[", "b1", "]"}], " ", 
+    RowBox[{"Conjugate", "[", "w1", "]"}]}], "+", 
+   RowBox[{"v1", " ", 
+    RowBox[{"Conjugate", "[", "b0", "]"}], " ", 
+    RowBox[{"Conjugate", "[", "w2", "]"}]}], "+", 
+   RowBox[{"v0", " ", 
+    RowBox[{"Conjugate", "[", "b1", "]"}], " ", 
+    RowBox[{"Conjugate", "[", "w2", "]"}]}], "-", 
+   RowBox[{"b0", " ", "v0", " ", 
+    RowBox[{"Conjugate", "[", "w3", "]"}]}], "+", 
+   RowBox[{"b1", " ", "v1", " ", 
+    RowBox[{"Conjugate", "[", "w3", "]"}]}]}],
+  Function[BoxForm`e$, 
+   MatrixForm[BoxForm`e$]]]], "Output",
+ CellChangeTimes->{
+  3.743343313742465*^9},ExpressionUUID->"edd0619f-6f12-4070-a1d2-\
+6b547877fadc"]
+}, Open  ]],
+
+Cell[CellGroupData[{
+
+Cell[BoxData[
+ RowBox[{"Simplify", "[", 
+  RowBox[{
+   RowBox[{"(", 
+    RowBox[{
+     RowBox[{"(", 
+      RowBox[{
+       RowBox[{"a0", " ", 
+        RowBox[{"mat", "[", "gx", "]"}]}], "+", 
+       RowBox[{"a1", " ", 
+        RowBox[{"mat", "[", "gy", "]"}]}], "+", 
+       RowBox[{"a2", " ", 
+        RowBox[{"mat", "[", "gz", "]"}]}], "+", 
+       RowBox[{"a3", " ", 
+        RowBox[{"mat", "[", "gt", "]"}]}]}], ")"}], "/.", 
+     RowBox[{"{", 
+      RowBox[{
+       RowBox[{"a0", "\[Rule]", 
+        RowBox[{
+         FractionBox["1", 
+          RowBox[{"2", "\[ImaginaryI]"}]], 
+         RowBox[{"(", 
+          RowBox[{"b0", "-", 
+           RowBox[{"Conjugate", "[", "b0", "]"}]}], ")"}]}]}], ",", 
+       RowBox[{"a1", "\[Rule]", 
+        RowBox[{
+         FractionBox["1", "2"], 
+         RowBox[{"(", 
+          RowBox[{"b0", "+", 
+           RowBox[{"Conjugate", "[", "b0", "]"}]}], ")"}]}]}], ",", 
+       RowBox[{"a2", "\[Rule]", 
+        RowBox[{
+         FractionBox["1", 
+          RowBox[{"2", "\[ImaginaryI]"}]], 
+         RowBox[{"(", 
+          RowBox[{"b1", "-", 
+           RowBox[{"Conjugate", "[", "b1", "]"}]}], ")"}]}]}], ",", 
+       RowBox[{"a3", "\[Rule]", 
+        RowBox[{
+         FractionBox["1", "2"], 
+         RowBox[{"(", 
+          RowBox[{"b1", "+", 
+           RowBox[{"Conjugate", "[", "b1", "]"}]}], ")"}]}]}]}], "}"}]}], 
+    ")"}], "//", "MatrixForm"}], "]"}]], "Input",
+ CellChangeTimes->{{3.7433421933181667`*^9, 
+  3.743342200786813*^9}},ExpressionUUID->"fb45123c-c610-4075-99b0-\
+7cd71c728ae7"],
+
+Cell[BoxData[
+ TagBox[
+  RowBox[{"(", "\[NoBreak]", GridBox[{
+     {"0", "0", "b1", 
+      RowBox[{"-", 
+       RowBox[{"Conjugate", "[", "b0", "]"}]}]},
+     {"0", "0", "b0", 
+      RowBox[{"Conjugate", "[", "b1", "]"}]},
+     {
+      RowBox[{"Conjugate", "[", "b1", "]"}], 
+      RowBox[{"Conjugate", "[", "b0", "]"}], "0", "0"},
+     {
+      RowBox[{"-", "b0"}], "b1", "0", "0"}
+    },
+    GridBoxAlignment->{
+     "Columns" -> {{Center}}, "ColumnsIndexed" -> {}, "Rows" -> {{Baseline}}, 
+      "RowsIndexed" -> {}},
+    GridBoxSpacings->{"Columns" -> {
+        Offset[0.27999999999999997`], {
+         Offset[0.7]}, 
+        Offset[0.27999999999999997`]}, "ColumnsIndexed" -> {}, "Rows" -> {
+        Offset[0.2], {
+         Offset[0.4]}, 
+        Offset[0.2]}, "RowsIndexed" -> {}}], "\[NoBreak]", ")"}],
+  Function[BoxForm`e$, 
+   MatrixForm[BoxForm`e$]]]], "Output",
+ CellChangeTimes->{{3.743228092720749*^9, 3.743228103044189*^9}, 
+   3.743228145808625*^9, 3.743228259025098*^9, 3.743228600081357*^9, {
+   3.743228667370229*^9, 3.743228688052019*^9}, {3.743228718056776*^9, 
+   3.743228743610117*^9}, 3.743228799225634*^9, 3.7432288499987583`*^9, {
+   3.743340583491489*^9, 3.7433405956062613`*^9}, 3.743342123072051*^9, 
+   3.743342201226069*^9},ExpressionUUID->"2ae14565-b412-4dc0-9dce-\
+bd6c1ba5ef27"]
+}, Open  ]],
 
 Cell["Generation of the abstract algebra:", "Text",
- CellChangeTimes->{{3.6942090658330803`*^9, 3.694209076132119*^9}}],
+ CellChangeTimes->{{3.6942090658330803`*^9, 
+  3.694209076132119*^9}},ExpressionUUID->"af247231-a58d-417b-987a-\
+26908dafffdb"],
 
 Cell[BoxData[{
  RowBox[{"Do", "[", 
@@ -331,10 +749,14 @@ Cell[BoxData[{
  CellChangeTimes->{{3.6941860329437103`*^9, 3.6941860343133917`*^9}, {
   3.694186163571176*^9, 3.6941862016761427`*^9}, {3.69418700219066*^9, 
   3.6941870425469627`*^9}, {3.694297326197534*^9, 3.6942974062629423`*^9}, {
-  3.694297634175386*^9, 3.6942976496897383`*^9}}],
+  3.694297634175386*^9, 
+  3.6942976496897383`*^9}},ExpressionUUID->"7c44cadd-e488-4f51-87d8-\
+c64eef11f40c"],
 
 Cell["Check that we can reconstruct the Euclidean metric:", "Text",
- CellChangeTimes->{{3.694209080190936*^9, 3.694209096585559*^9}}],
+ CellChangeTimes->{{3.694209080190936*^9, 
+  3.694209096585559*^9}},ExpressionUUID->"856f1746-1107-4509-a5ce-\
+ac9c7f56cdb1"],
 
 Cell[CellGroupData[{
 
@@ -353,7 +775,8 @@ Cell[BoxData[
      RowBox[{"i2", ",", 
       RowBox[{"{", 
        RowBox[{"gx", ",", "gy", ",", "gz", ",", "gt"}], "}"}]}], "}"}]}], 
-   "]"}], "//", "MatrixForm"}]], "Input"],
+   "]"}], "//", "MatrixForm"}]], "Input",ExpressionUUID->"8674484a-8543-434f-\
+b177-3b27f9353212"],
 
 Cell[BoxData[
  TagBox[
@@ -389,11 +812,14 @@ Cell[BoxData[
    3.694894917294997*^9, 3.6948949597758904`*^9, 3.6949627108824663`*^9, 
    3.694962851174364*^9, 3.6949634461305313`*^9, 3.694964397971891*^9, {
    3.6949647161810303`*^9, 3.6949647264866943`*^9}, 3.6949650598407507`*^9, 
-   3.694965165456048*^9}]
+   3.694965165456048*^9},ExpressionUUID->"c3b3f84d-91f6-41af-af6b-\
+a394ca020511"]
 }, Open  ]],
 
 Cell["Full multiplication table:", "Text",
- CellChangeTimes->{{3.694209113187169*^9, 3.6942091210767593`*^9}}],
+ CellChangeTimes->{{3.694209113187169*^9, 
+  3.6942091210767593`*^9}},ExpressionUUID->"518a3040-54b1-4d43-8947-\
+5c7d12efa94d"],
 
 Cell[CellGroupData[{
 
@@ -409,7 +835,9 @@ Cell[BoxData[
   "MatrixForm"}]], "Input",
  CellChangeTimes->{{3.6941862426584797`*^9, 3.694186256858178*^9}, {
   3.694186605271886*^9, 3.694186617894228*^9}, {3.694186972131384*^9, 
-  3.69418697419895*^9}, {3.694192885918524*^9, 3.694192888888296*^9}}],
+  3.69418697419895*^9}, {3.694192885918524*^9, 
+  3.694192888888296*^9}},ExpressionUUID->"61a2e974-2b39-4a07-8043-\
+2dfd39a70569"],
 
 Cell[BoxData[
  TagBox[
@@ -577,7 +1005,8 @@ Cell[BoxData[
    3.694894917375866*^9, 3.694894959839177*^9, 3.694962710968522*^9, 
    3.6949628512863817`*^9, 3.694963446206002*^9, 3.694964398046623*^9, {
    3.6949647162797327`*^9, 3.694964726526013*^9}, 3.6949650599380713`*^9, 
-   3.694965165531089*^9}]
+   3.694965165531089*^9},ExpressionUUID->"73480ac0-3043-4077-80cc-\
+b952a94c822a"]
 }, Open  ]]
 }, Open  ]],
 
@@ -585,11 +1014,15 @@ Cell[CellGroupData[{
 
 Cell["Header file Gamma.h generation", "Section",
  CellChangeTimes->{{3.694208986784461*^9, 3.6942090005062523`*^9}, {
-  3.694965123390101*^9, 3.694965123950851*^9}}],
+  3.694965123390101*^9, 
+  3.694965123950851*^9}},ExpressionUUID->"4e833cd6-9f0e-4aa3-a873-\
+3d579e874720"],
 
 Cell["File skeleton:", "Text",
  CellFrame->{{0, 0}, {0, 0.5}},
- CellChangeTimes->{{3.694209131604498*^9, 3.694209133792495*^9}}],
+ CellChangeTimes->{{3.694209131604498*^9, 
+  3.694209133792495*^9}},ExpressionUUID->"6d27fc04-3a60-4e03-8df7-\
+3dd3aeee35b4"],
 
 Cell[BoxData[
  RowBox[{
@@ -643,12 +1076,14 @@ Algebra                                                      g;\n  public:\n  \
    3.694963343265525*^9}, {3.694964367519239*^9, 3.69496439461199*^9}, {
    3.694964462130747*^9, 3.6949644669959793`*^9}, 3.694964509762739*^9, {
    3.694964705045744*^9, 3.694964723148797*^9}, {3.694964992988984*^9, 
-   3.6949649968504257`*^9}}],
+   3.6949649968504257`*^9}},ExpressionUUID->"c7103bd6-b539-4495-b98c-\
+d4d12ac6cad8"],
 
 Cell["Gamma enum generation:", "Text",
  CellFrame->{{0, 0}, {0, 0.5}},
  CellChangeTimes->{{3.694209168488991*^9, 3.6942091715073423`*^9}, 
-   3.694209215969149*^9}],
+   3.694209215969149*^9},ExpressionUUID->"0625593d-290f-4a39-9d80-\
+8e2c6fdbc94e"],
 
 Cell[BoxData[{
  RowBox[{
@@ -798,7 +1233,9 @@ Cell[BoxData[{
  CellChangeTimes->{{3.69418665896658*^9, 3.6941867305497723`*^9}, {
   3.694186782865391*^9, 3.694186840513199*^9}, {3.694186889568404*^9, 
   3.694186968177154*^9}, {3.6941870767730503`*^9, 3.69418716300373*^9}, {
-  3.694213209628356*^9, 3.6942132459364033`*^9}}],
+  3.694213209628356*^9, 
+  3.6942132459364033`*^9}},ExpressionUUID->"1ad4904c-352f-4b1d-a7c7-\
+91e1b0549409"],
 
 Cell[BoxData[
  RowBox[{
@@ -855,11 +1292,14 @@ Cell[BoxData[
   3.694206752732321*^9, 3.694206753090602*^9}, {3.6942071072527027`*^9, 
   3.694207214318696*^9}, {3.694211442308366*^9, 3.694211490100521*^9}, {
   3.6942115668101377`*^9, 3.694211571070611*^9}, {3.6942133172135267`*^9, 
-  3.694213325783718*^9}}],
+  3.694213325783718*^9}},ExpressionUUID->"0221674f-9b63-4662-91bc-\
+ccc8c6ae9589"],
 
 Cell["Multiplication functions generation:", "Text",
  CellFrame->{{0, 0}, {0, 0.5}},
- CellChangeTimes->{{3.69420919761381*^9, 3.694209206431526*^9}}],
+ CellChangeTimes->{{3.69420919761381*^9, 
+  3.694209206431526*^9}},ExpressionUUID->"d2d2257a-487b-416f-bc40-\
+abd4482225f7"],
 
 Cell[BoxData[{
  RowBox[{
@@ -1257,7 +1697,8 @@ iMatrix<vtype, Ns>>::type\n{\n  iMatrix<vtype, Ns> ret;\n\n  switch (G.g) \n  \
    3.694214921431739*^9, 3.694214951876449*^9}, {3.6942151046483088`*^9, 
    3.694215118335286*^9}, {3.6942151634191313`*^9, 3.694215188429871*^9}, {
    3.6942940839999113`*^9, 3.694294090686364*^9}, {3.69489488486012*^9, 
-   3.6948948916252403`*^9}}],
+   3.6948948916252403`*^9}},ExpressionUUID->"daea68a9-c9e8-46ab-9bc8-\
+5186e2cf477c"],
 
 Cell["Header file generation:", "Text",
  CellFrame->{{0, 0}, {0, 0.5}},
@@ -1425,9 +1866,9 @@ Cell[BoxData[""], "Input",
 }, Open  ]]
 },
 WindowSize->{1246, 1005},
-WindowMargins->{{64, Automatic}, {Automatic, 0}},
-FrontEndVersion->"11.0 for Mac OS X x86 (32-bit, 64-bit Kernel) (September \
-21, 2016)",
+WindowMargins->{{282, Automatic}, {Automatic, 14}},
+FrontEndVersion->"11.2 for Mac OS X x86 (32-bit, 64-bit Kernel) (September \
+10, 2017)",
 StyleDefinitions->"Default.nb"
 ]
 (* End of Notebook Content *)
@@ -1441,46 +1882,81 @@ CellTagsIndex->{}
 *)
 (*NotebookFileOutline
 Notebook[{
-Cell[558, 20, 1295, 18, 502, "Text"],
+Cell[558, 20, 1365, 19, 557, "Text",ExpressionUUID->"f0373ef0-8c33-4e9a-9f09-9bf718da72ef"],
 Cell[CellGroupData[{
-Cell[1878, 42, 513, 10, 75, "Input"],
-Cell[2394, 54, 1090, 15, 32, "Output"]
+Cell[1948, 43, 570, 11, 73, "Input",ExpressionUUID->"5c937a3e-adfd-4d7e-8fde-afb3337c72d9"],
+Cell[2521, 56, 1172, 17, 34, "Output",ExpressionUUID->"72817ba6-2f6a-4a4d-8212-6f0970f49e7c"]
 }, Open  ]],
 Cell[CellGroupData[{
-Cell[3521, 74, 114, 1, 64, "Section"],
-Cell[3638, 77, 475, 14, 54, "Input"],
-Cell[4116, 93, 190, 2, 30, "Text"],
-Cell[4309, 97, 5454, 168, 427, "Input"],
-Cell[9766, 267, 119, 1, 30, "Text"],
-Cell[9888, 270, 2115, 63, 96, "Input"],
-Cell[12006, 335, 133, 1, 30, "Text"],
-Cell[CellGroupData[{
-Cell[12164, 340, 479, 15, 32, "Input"],
-Cell[12646, 357, 1648, 34, 96, "Output"]
+Cell[3730, 78, 248, 5, 30, "Input",ExpressionUUID->"d1a0fd03-85e1-43af-ba80-3ca4235675d8"],
+Cell[3981, 85, 299, 9, 34, "Output",ExpressionUUID->"16d3f953-4b24-4ed2-ae62-306dcab66ca7"]
 }, Open  ]],
-Cell[14309, 394, 110, 1, 30, "Text"],
 Cell[CellGroupData[{
-Cell[14444, 399, 476, 12, 32, "Input"],
-Cell[14923, 413, 6697, 166, 312, "Output"]
+Cell[4317, 99, 469, 14, 33, "Input",ExpressionUUID->"f0fa2a5c-3d81-4d75-a447-50c7ca3459ff"],
+Cell[4789, 115, 2423, 77, 56, "Output",ExpressionUUID->"d9825c95-24bb-442a-8734-4c0f47e99dfc"]
+}, Open  ]],
+Cell[7227, 195, 751, 18, 30, "Input",ExpressionUUID->"aea76313-c89e-45e8-b429-3f454091666d"],
+Cell[CellGroupData[{
+Cell[8003, 217, 323, 10, 30, "Input",ExpressionUUID->"07da3998-8eab-40ba-8c0b-ac6b130cb4fb"],
+Cell[CellGroupData[{
+Cell[8351, 231, 156, 3, 24, "Print",ExpressionUUID->"c577ba06-b67a-405a-9ff5-2bf7dc898d03"],
+Cell[8510, 236, 156, 3, 24, "Print",ExpressionUUID->"d041aa36-0cea-457c-9d4b-1fe9be66e2ab"],
+Cell[8669, 241, 155, 3, 24, "Print",ExpressionUUID->"bf141b55-86b2-4430-a994-5c03d5a19441"],
+Cell[8827, 246, 155, 3, 24, "Print",ExpressionUUID->"4968a660-4ecf-4b66-9071-8bd798c18d21"],
+Cell[8985, 251, 156, 3, 24, "Print",ExpressionUUID->"4e22d943-2680-416b-a1d7-a16ca20b781f"],
+Cell[9144, 256, 157, 3, 24, "Print",ExpressionUUID->"6dd38385-08b3-4dd9-932f-98a00c6db1b2"],
+Cell[9304, 261, 155, 3, 24, "Print",ExpressionUUID->"ef3baad3-91d1-4735-9a22-53495a624c15"],
+Cell[9462, 266, 154, 3, 24, "Print",ExpressionUUID->"413fbb68-5017-4272-a62a-fa234e6daaea"],
+Cell[9619, 271, 155, 3, 24, "Print",ExpressionUUID->"3a832a60-ae00-414b-a9ac-f5e86e67e917"],
+Cell[9777, 276, 155, 3, 24, "Print",ExpressionUUID->"bfc79ef6-f6c7-4f1e-88e8-005ac314be9c"],
+Cell[9935, 281, 154, 3, 24, "Print",ExpressionUUID->"0f892891-f885-489c-9925-ddef4d698410"],
+Cell[10092, 286, 156, 3, 24, "Print",ExpressionUUID->"2906f190-e673-4f33-9c34-e8e56efe7a27"]
+}, Open  ]],
+Cell[10263, 292, 376, 9, 34, "Output",ExpressionUUID->"500ca3c1-88d8-46e5-a1a1-86a7878e5638"]
+}, Open  ]],
+Cell[CellGroupData[{
+Cell[10676, 306, 174, 3, 67, "Section",ExpressionUUID->"a5b064b3-3011-4922-8559-ead857cad102"],
+Cell[10853, 311, 535, 16, 52, "Input",ExpressionUUID->"aa28f02b-31e1-4df2-9b5d-482177464b59"],
+Cell[11391, 329, 250, 4, 35, "Text",ExpressionUUID->"c8896b88-f1db-4ce4-b7a6-0c9838bdb8f1"],
+Cell[11644, 335, 5511, 169, 425, "Input",ExpressionUUID->"52a96ff6-047e-4043-86d0-e303866e5f8e"],
+Cell[CellGroupData[{
+Cell[17180, 508, 2183, 58, 135, "Input",ExpressionUUID->"8b0f4955-2c3f-418c-9226-9be8f87621e8"],
+Cell[19366, 568, 1027, 27, 67, "Output",ExpressionUUID->"edd0619f-6f12-4070-a1d2-6b547877fadc"]
+}, Open  ]],
+Cell[CellGroupData[{
+Cell[20430, 600, 1543, 46, 114, "Input",ExpressionUUID->"fb45123c-c610-4075-99b0-7cd71c728ae7"],
+Cell[21976, 648, 1311, 32, 98, "Output",ExpressionUUID->"2ae14565-b412-4dc0-9dce-bd6c1ba5ef27"]
+}, Open  ]],
+Cell[23302, 683, 179, 3, 35, "Text",ExpressionUUID->"af247231-a58d-417b-987a-26908dafffdb"],
+Cell[23484, 688, 2175, 65, 94, "Input",ExpressionUUID->"7c44cadd-e488-4f51-87d8-c64eef11f40c"],
+Cell[25662, 755, 193, 3, 35, "Text",ExpressionUUID->"856f1746-1107-4509-a5ce-ac9c7f56cdb1"],
+Cell[CellGroupData[{
+Cell[25880, 762, 536, 16, 30, "Input",ExpressionUUID->"8674484a-8543-434f-b177-3b27f9353212"],
+Cell[26419, 780, 1705, 35, 87, "Output",ExpressionUUID->"c3b3f84d-91f6-41af-af6b-a394ca020511"]
+}, Open  ]],
+Cell[28139, 818, 170, 3, 35, "Text",ExpressionUUID->"518a3040-54b1-4d43-8947-5c7d12efa94d"],
+Cell[CellGroupData[{
+Cell[28334, 825, 536, 14, 30, "Input",ExpressionUUID->"61a2e974-2b39-4a07-8043-2dfd39a70569"],
+Cell[28873, 841, 6754, 167, 303, "Output",ExpressionUUID->"73480ac0-3043-4077-80cc-b952a94c822a"]
 }, Open  ]]
 }, Open  ]],
 Cell[CellGroupData[{
-Cell[21669, 585, 166, 2, 64, "Section"],
-Cell[21838, 589, 128, 2, 38, "Text"],
-Cell[21969, 593, 2923, 52, 705, "Input"],
-Cell[24895, 647, 164, 3, 38, "Text"],
-Cell[25062, 652, 4876, 148, 684, "Input"],
-Cell[29941, 802, 2588, 55, 201, "Input"],
-Cell[32532, 859, 149, 2, 38, "Text"],
-Cell[32684, 863, 15249, 396, 2133, "Input"],
-Cell[47936, 1261, 137, 2, 38, "Text"],
-Cell[48076, 1265, 521, 12, 32, "Input"]
+Cell[35676, 1014, 226, 4, 67, "Section",ExpressionUUID->"4e833cd6-9f0e-4aa3-a873-3d579e874720"],
+Cell[35905, 1020, 188, 4, 44, "Text",ExpressionUUID->"6d27fc04-3a60-4e03-8df7-3dd3aeee35b4"],
+Cell[36096, 1026, 2980, 53, 703, "Input",ExpressionUUID->"c7103bd6-b539-4495-b98c-d4d12ac6cad8"],
+Cell[39079, 1081, 221, 4, 44, "Text",ExpressionUUID->"0625593d-290f-4a39-9d80-8e2c6fdbc94e"],
+Cell[39303, 1087, 4936, 150, 682, "Input",ExpressionUUID->"1ad4904c-352f-4b1d-a7c7-91e1b0549409"],
+Cell[44242, 1239, 2645, 56, 199, "Input",ExpressionUUID->"0221674f-9b63-4662-91bc-ccc8c6ae9589"],
+Cell[46890, 1297, 209, 4, 44, "Text",ExpressionUUID->"d2d2257a-487b-416f-bc40-abd4482225f7"],
+Cell[47102, 1303, 15306, 397, 2131, "Input",ExpressionUUID->"daea68a9-c9e8-46ab-9bc8-5186e2cf477c"],
+Cell[62411, 1702, 137, 2, 44, "Text",ExpressionUUID->"76ba9d5a-7ee3-4888-be7e-6377003275e8"],
+Cell[62551, 1706, 521, 12, 30, "Input",ExpressionUUID->"4ec61f4c-3fd3-49ea-b5ef-6f7f04a16b34"]
 }, Open  ]],
 Cell[CellGroupData[{
-Cell[48634, 1282, 167, 2, 64, "Section"],
-Cell[48804, 1286, 5693, 122, 831, "Input"],
-Cell[54500, 1410, 448, 10, 32, "Input"],
-Cell[54951, 1422, 94, 1, 32, "Input"]
+Cell[63109, 1723, 167, 2, 67, "Section",ExpressionUUID->"a4458b3a-09b5-4e36-a1fc-781d6702b2dc"],
+Cell[63279, 1727, 5693, 122, 829, "Input",ExpressionUUID->"b1b309f8-a3a7-4081-a781-c3845e3cd372"],
+Cell[68975, 1851, 448, 10, 30, "Input",ExpressionUUID->"cba42949-b0f2-42ce-aebd-ffadfd83ef88"],
+Cell[69426, 1863, 94, 1, 30, "Input",ExpressionUUID->"6175b72c-af9f-43c2-b4ca-bd84c48a456d"]
 }, Open  ]]
 }
 ]
diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h
new file mode 100644
index 00000000..c6ec8150
--- /dev/null
+++ b/Grid/qcd/utils/A2Autils.h
@@ -0,0 +1,1404 @@
+#pragma once
+//#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Eigen/unsupported/CXX11/Tensor>
+
+namespace Grid {
+namespace QCD {
+
+#undef DELTA_F_EQ_2
+
+template <typename FImpl>
+class A2Autils // contraction helpers templated on the fermion implementation; every member is static
+{
+public:
+  using ComplexField    = typename FImpl::ComplexField;
+  using FermionField    = typename FImpl::FermionField;
+  using PropagatorField = typename FImpl::PropagatorField;
+  // Site object of FImpl and the types derived from it
+  using vobj        = typename FImpl::SiteSpinor;
+  using sobj        = typename vobj::scalar_object;
+  using scalar_type = typename vobj::scalar_type;
+  using vector_type = typename vobj::vector_type;
+  // _v aliases are built on vector_type, _s aliases on scalar_type
+  using SpinMatrix_v = iSpinMatrix<vector_type>;
+  using SpinMatrix_s = iSpinMatrix<scalar_type>;
+  using Scalar_v     = iSinglet<vector_type>;
+  using Scalar_s     = iSinglet<scalar_type>;
+
+  using SpinColourMatrix_v = iSpinColourMatrix<vector_type>;
+
+  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
+  static void MesonField(TensorType &mat,
+                         const FermionField *lhs_wi,
+                         const FermionField *rhs_vj,
+                         std::vector<Gamma::Algebra> gammas,
+                         const std::vector<ComplexField> &mom,
+                         int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
+  // W-V field with a momentum phase per entry of mom; output is rank 4
+  static void PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
+                             const FermionField *wi,
+                             const FermionField *vj,
+                             const std::vector<ComplexField> &mom,
+                             int orthogdim);
+  // Shared kernel behind PionFieldWV / PionFieldWW / PionFieldVV; g5 selects the variant
+  static void PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
+                          const FermionField *wi,
+                          const FermionField *vj,
+                          int orthogdim,
+                          int g5);
+  // Fixed-flag wrappers over PionFieldXX
+  static void PionFieldWV(Eigen::Tensor<ComplexD,3> &mat,
+                          const FermionField *wi,
+                          const FermionField *vj,
+                          int orthogdim);
+  static void PionFieldWW(Eigen::Tensor<ComplexD,3> &mat,
+                          const FermionField *wi,
+                          const FermionField *wj,
+                          int orthogdim);
+  static void PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
+                          const FermionField *vi,
+                          const FermionField *vj,
+                          int orthogdim);
+
+  template <typename TensorType> // output: rank 5 tensor, e.g. Eigen::Tensor<ComplexD, 5>
+  static void AslashField(TensorType &mat,
+                          const FermionField *lhs_wi,
+                          const FermionField *rhs_vj,
+                          const std::vector<ComplexField> &emB0,
+                          const std::vector<ComplexField> &emB1,
+                          int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
+
+  static void ContractWWVV(std::vector<PropagatorField> &WWVV,
+                           const Eigen::Tensor<ComplexD,3> &WW_sd,
+                           const FermionField *vs,
+                           const FermionField *vd);
+
+  static void ContractFourQuarkColourDiagonal(const PropagatorField &WWVV0,
+                                              const PropagatorField &WWVV1,
+                                              const std::vector<Gamma> &gamma0,
+                                              const std::vector<Gamma> &gamma1,
+                                              ComplexField &O_trtr,
+                                              ComplexField &O_fig8);
+
+  static void ContractFourQuarkColourMix(const PropagatorField &WWVV0,
+                                         const PropagatorField &WWVV1,
+                                         const std::vector<Gamma> &gamma0,
+                                         const std::vector<Gamma> &gamma1,
+                                         ComplexField &O_trtr,
+                                         ComplexField &O_fig8);
+#ifdef DELTA_F_EQ_2
+  static void DeltaFeq2(int dt_min, int dt_max,
+                        Eigen::Tensor<ComplexD,2> &dF2_fig8,
+                        Eigen::Tensor<ComplexD,2> &dF2_trtr,
+                        Eigen::Tensor<ComplexD,2> &dF2_fig8_mix,
+                        Eigen::Tensor<ComplexD,2> &dF2_trtr_mix,
+                        Eigen::Tensor<ComplexD,1> &denom_A0,
+                        Eigen::Tensor<ComplexD,1> &denom_P,
+                        Eigen::Tensor<ComplexD,3> &WW_sd,
+                        const FermionField *vs,
+                        const FermionField *vd,
+                        int orthogdim);
+#endif
+};
+
+template <class FImpl> // MesonField: fills mat(m,mu,t,i,j) from conj(lhs_wi[i]) and rhs_vj[j], per momentum m, gamma mu and slice t along orthogdim
+template <typename TensorType>
+void A2Autils<FImpl>::MesonField(TensorType &mat, // output; must be pre-sized, dims 3 and 4 give the i and j extents
+				 const FermionField *lhs_wi, // array indexed by i; each site value is conjugated below
+				 const FermionField *rhs_vj, // array indexed by j
+				 std::vector<Gamma::Algebra> gammas, // one output index mu per entry
+				 const std::vector<ComplexField > &mom, // one phase field per output index m
+				 int orthogdim, double *t_kernel, double *t_gsum) // timers are optional; each is written only when non-null
+{
+  typedef typename FImpl::SiteSpinor vobj;
+
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  typedef iSpinMatrix<vector_type> SpinMatrix_v;
+  typedef iSpinMatrix<scalar_type> SpinMatrix_s;
+  
+  int Lblock = mat.dimension(3); // extent of index i (lhs_wi)
+  int Rblock = mat.dimension(4); // extent of index j (rhs_vj)
+
+  GridBase *grid = lhs_wi[0].Grid(); // the grid of the first lhs field is used for all layout queries
+  
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  int Nt     = grid->GlobalDimensions()[orthogdim]; // global extent along orthogdim; checked against mat.dimension(2)
+  int Ngamma = gammas.size();
+  int Nmom   = mom.size();
+
+  int fd=grid->_fdimensions[orthogdim]; // not referenced again in this function
+  int ld=grid->_ldimensions[orthogdim]; // presumably the per-rank extent along orthogdim -- confirm in GridBase
+  int rd=grid->_rdimensions[orthogdim]; // presumably ld reduced by the SIMD layout along orthogdim -- confirm in GridBase
+
+  // will locally sum vectors first
+  // sum across these down to scalars
+  // splitting the SIMD
+  int MFrvol = rd*Lblock*Rblock*Nmom; // number of vector-typed accumulators (lvSum)
+  int MFlvol = ld*Lblock*Rblock*Nmom; // number of scalar-typed accumulators (lsSum)
+
+  Vector<SpinMatrix_v > lvSum(MFrvol);
+  thread_loop( (int r = 0; r < MFrvol; r++),{
+    lvSum[r] = Zero();
+  });
+
+  Vector<SpinMatrix_s > lsSum(MFlvol);             
+  thread_loop( (int r = 0; r < MFlvol; r++),{
+    lsSum[r]=scalar_type(0.0);
+  });
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+
+  // potentially wasting cores here if local time extent too small
+  if (t_kernel) *t_kernel = -usecond(); // start of kernel timing; closed after the SIMD-lane reduction below
+  thread_loop( (int r=0;r<rd;r++),{ // each r only touches lvSum entries whose index carries the r term
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+
+	int ss= so+n*stride+b; // site index used for every field on this plane
+
+	for(int i=0;i<Lblock;i++){
+
+	  auto lhs_v = lhs_wi[i].View();
+	  auto left = conjugate(lhs_v[ss]);
+
+	  for(int j=0;j<Rblock;j++){
+
+	    SpinMatrix_v vv;
+	    auto rhs_v = rhs_vj[j].View();
+	    auto right = rhs_v[ss];
+	    for(int s1=0;s1<Ns;s1++){
+	    for(int s2=0;s2<Ns;s2++){
+	      vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0) // innermost index 0..2 summed (colour, presumably); s1 from right, s2 from left
+		+             left()(s2)(1) * right()(s1)(1)
+		+             left()(s2)(2) * right()(s1)(2);
+	    }}
+	    
+	    // After getting the sitewise product do the mom phase loop
+	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; // layout: m fastest, then i, then j, then r
+	    for ( int m=0;m<Nmom;m++){
+	      int idx = m+base;
+	      auto mom_v = mom[m].View();
+	      auto phase = mom_v[ss];
+	      mac(&lvSum[idx],&vv,&phase); // presumably lvSum[idx] += vv*phase -- TODO confirm against mac()
+	    }
+	  
+	  }
+	}
+      }
+    }
+  });
+
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  thread_loop( (int rt=0;rt<rd;rt++),{ // ldx below satisfies ldx % rd == rt, so different rt never share an lsSum entry
+
+    Coordinate icoor(Nd);
+    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);               
+
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+    for(int m=0;m<Nmom;m++){
+
+      int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
+
+      extract(lvSum[ij_rdx],extracted); // presumably one scalar object per SIMD lane -- confirm against extract()
+
+      for(int idx=0;idx<Nsimd;idx++){
+
+	grid->iCoorFromIindex(icoor,idx);
+
+	int ldx    = rt+icoor[orthogdim]*rd; // slice index within the ld range for lane idx
+
+	int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
+
+	lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
+
+      }
+    }}}
+  });
+  if (t_kernel) *t_kernel += usecond(); // t_kernel now holds the usecond() difference for both loops above
+  assert(mat.dimension(0) == Nmom);
+  assert(mat.dimension(1) == Ngamma);
+  assert(mat.dimension(2) == Nt);
+
+  // Fill mat for every t = lt + pt*ld: slices with pt == pc get this rank's sums, all others are zeroed for the global sum
+  int pd = grid->_processors[orthogdim];
+  int pc = grid->_processor_coor[orthogdim];
+  thread_loop_collapse(2, (int lt=0;lt<ld;lt++), 
+  {
+    for(int pt=0;pt<pd;pt++){
+      int t = lt + pt*ld; // index into dimension 2 of mat
+      if (pt == pc){
+	for(int i=0;i<Lblock;i++){
+	  for(int j=0;j<Rblock;j++){
+	    for(int m=0;m<Nmom;m++){
+	      int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
+	      for(int mu=0;mu<Ngamma;mu++){
+		// this is a bit slow
+		mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]));
+	      }
+	    }
+	  }
+	}
+      } else { 
+	const scalar_type zz(0.0);
+	for(int i=0;i<Lblock;i++){
+	  for(int j=0;j<Rblock;j++){
+	    for(int mu=0;mu<Ngamma;mu++){
+	      for(int m=0;m<Nmom;m++){
+		mat(m,mu,t,i,j) =zz;
+	      }
+	    }
+	  }
+	}
+      }
+    }
+  });
+
+  ////////////////////////////////////////////////////////////////////
+  // This global sum is taking as much as 50% of time on 16 nodes
+  // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
+  // Healthy size that should suffice
+  ////////////////////////////////////////////////////////////////////
+  if (t_gsum) *t_gsum = -usecond();
+  grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock); // treats mat as one contiguous array of that many elements
+  if (t_gsum) *t_gsum += usecond();
+}
+
+
+///////////////////////////////////////////////////////////////////
+//Meson 
+// Interested in
+//
+//      sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
+//
+// Conventional meson field:
+//                 
+//    = sum_x,y Trace[ sum_j G |v_j(y,ty)> <w_j(x,tx)|  G sum_i |v_i(x,tx) ><w_i(y,ty)| ]
+//    = sum_ij sum_x,y < w_j(x,tx)| G |v_i(x,tx) > <w_i(y,ty)|G| v_j(y,ty) >
+//    = sum_ij PI_ji(tx) PI_ij(ty)
+//
+// G5-Hermiticity
+//
+//      sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
+//    = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ]
+//    = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> <w_j(x,tx)|  G g5 sum_i   (|v_i(y,ty)> <w_i(x,tx)|)^dag ]      --  (*)
+//
+// NB:  Dag applies to internal indices spin,colour,complex
+//
+//    = sum_ij sum_x,y Trace[ g5 G |v_j(y,ty)> <w_j(x,tx)|  G g5  |w_i(x,tx)> <v_i(y,ty)| ]
+//    = sum_ij sum_x,y <v_i(y,ty)|g5 G |v_j(y,ty)> <w_j(x,tx)|  G g5 |w_i(x,tx)> 
+//    = sum_ij  PionVV(ty) PionWW(tx)
+//
+// (*) is only a correct estimator if w_i and w_j come from distinct noise sets, so as to preserve the Kronecker
+//     delta expectation value. Otherwise it is biased.
+////////////////////////////////////////////////////////////////////
+
+template<class FImpl> // PionFieldXX: mat(t,i,j) = per-slice sum of the 12-term product of conj(wi[i]) with vj[j], summed over ranks at the end
+void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat, // output; must be pre-sized, dims 1 and 2 give the i and j extents
+				  const FermionField *wi, // array indexed by i; each site value is conjugated below
+				  const FermionField *vj, // array indexed by j
+				  int orthogdim, // direction whose index t is kept
+				  int g5) // non-zero: negate components (2)(*) and (3)(*) of the conjugated wi before the product
+{
+  int Lblock = mat.dimension(1); // extent of index i (wi)
+  int Rblock = mat.dimension(2); // extent of index j (vj)
+
+  GridBase *grid = wi[0].Grid(); // the grid of the first wi field is used for all layout queries
+  
+  const int    nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  int Nt     = grid->GlobalDimensions()[orthogdim]; // global extent along orthogdim; checked against mat.dimension(0)
+
+  int fd=grid->_fdimensions[orthogdim]; // not referenced again in this function
+  int ld=grid->_ldimensions[orthogdim]; // presumably the per-rank extent along orthogdim -- confirm in GridBase
+  int rd=grid->_rdimensions[orthogdim]; // presumably ld reduced by the SIMD layout along orthogdim -- confirm in GridBase
+
+  // will locally sum vectors first
+  // sum across these down to scalars
+  // splitting the SIMD
+  int MFrvol = rd*Lblock*Rblock; // number of vector-typed accumulators (lvSum)
+  int MFlvol = ld*Lblock*Rblock; // number of scalar-typed accumulators (lsSum)
+
+  Vector<vector_type > lvSum(MFrvol);
+  thread_loop(  (int r = 0; r < MFrvol; r++),{
+    lvSum[r] = Zero();
+  });
+
+  Vector<scalar_type > lsSum(MFlvol);             
+  thread_loop(  (int r = 0; r < MFlvol; r++),{
+    lsSum[r]=scalar_type(0.0);
+  });
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+
+  thread_loop( (int r=0;r<rd;r++),{ // each r only touches lvSum entries whose index carries the r term
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+
+	int ss= so+n*stride+b; // site index used for every field on this plane
+
+	for(int i=0;i<Lblock;i++){
+
+	  auto wi_v = wi[i].View();
+	  auto w = conjugate(wi_v[ss]);
+	  if (g5) { // sign flip on components (2)(0..2) and (3)(0..2) of the local copy w
+	    w()(2)(0) = - w()(2)(0);
+	    w()(2)(1) = - w()(2)(1);
+	    w()(2)(2) = - w()(2)(2);
+	    w()(3)(0) = - w()(3)(0);
+	    w()(3)(1) = - w()(3)(1);
+	    w()(3)(2) = - w()(3)(2);
+	  }
+	  for(int j=0;j<Rblock;j++){
+	    
+	    auto vj_v=vj[j].View();
+	    auto v  = vj_v[ss];
+	    auto vv = v()(0)(0); // fixes the type of vv; the value is overwritten by the sum below
+
+	    vv =      w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out (the g5 sign, if any, is already in w)
+	      +       w()(0)(1) * v()(0)(1)
+	      +       w()(0)(2) * v()(0)(2)
+	      +       w()(1)(0) * v()(1)(0)
+	      +       w()(1)(1) * v()(1)(1)
+	      +       w()(1)(2) * v()(1)(2)
+	      +       w()(2)(0) * v()(2)(0)
+	      +       w()(2)(1) * v()(2)(1)
+	      +       w()(2)(2) * v()(2)(2)
+	      +       w()(3)(0) * v()(3)(0)
+	      +       w()(3)(1) * v()(3)(1)
+	      +       w()(3)(2) * v()(3)(2);
+	    
+	    int idx = i+Lblock*j+Lblock*Rblock*r; // layout: i fastest, then j, then r
+	    lvSum[idx] = lvSum[idx]+vv;
+	  }
+	}
+      }
+    }
+  });
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  thread_loop( (int rt=0;rt<rd;rt++),{ // ldx below satisfies ldx % rd == rt, so different rt never share an lsSum entry
+
+      Coordinate icoor(nd);
+    iScalar<vector_type> temp; // wrapper so that extract() can be given the bare vector_type accumulator
+    ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);               
+
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+
+      int ij_rdx = i+Lblock*j+Lblock*Rblock*rt;
+
+      temp._internal =lvSum[ij_rdx];
+      extract(temp,extracted); // presumably one scalar object per SIMD lane -- confirm against extract()
+
+      for(int idx=0;idx<Nsimd;idx++){
+
+	grid->iCoorFromIindex(icoor,idx);
+
+	int ldx    = rt+icoor[orthogdim]*rd; // slice index within the ld range for lane idx
+
+	int ij_ldx =i+Lblock*j+Lblock*Rblock*ldx;
+
+	lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal;
+
+      }
+    }}
+  });
+
+  assert(mat.dimension(0) == Nt);
+  // Fill mat for every t = lt + pt*ld: slices with pt == pc get this rank's sums, all others are zeroed for the global sum
+  int pd = grid->_processors[orthogdim];
+  int pc = grid->_processor_coor[orthogdim];
+  thread_loop_collapse(2,(int lt=0;lt<ld;lt++),
+  {
+    for(int pt=0;pt<pd;pt++){
+      int t = lt + pt*ld; // index into dimension 0 of mat
+      if (pt == pc){
+	for(int i=0;i<Lblock;i++){
+	  for(int j=0;j<Rblock;j++){
+	    int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
+	    mat(t,i,j) = lsSum[ij_dx];
+	  }
+	}
+      } else { 
+	const scalar_type zz(0.0);
+	for(int i=0;i<Lblock;i++){
+	  for(int j=0;j<Rblock;j++){
+	    mat(t,i,j) =zz;
+	  }
+	}
+      }
+    }
+  });
+
+  grid->GlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock); // treats mat as one contiguous array of that many elements
+}
+
+template<class FImpl> // PionFieldWVmom: mat(m,t,i,j) = per-slice sum of the signed product of conj(wi[i]) with vj[j], weighted by mom[m]
+void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat, // output; must be pre-sized, dims 2 and 3 give the i and j extents
+				     const FermionField *wi, // array indexed by i; each site value is conjugated below
+				     const FermionField *vj, // array indexed by j
+				     const std::vector<ComplexField > &mom, // one phase field per output index m
+				     int orthogdim) // direction whose index t is kept
+{
+  int Lblock = mat.dimension(2); // extent of index i (wi)
+  int Rblock = mat.dimension(3); // extent of index j (vj)
+
+  GridBase *grid = wi[0].Grid(); // the grid of the first wi field is used for all layout queries
+  
+  const int    nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  int Nt     = grid->GlobalDimensions()[orthogdim]; // global extent along orthogdim; checked against mat.dimension(1)
+  int Nmom   = mom.size();
+
+  int fd=grid->_fdimensions[orthogdim]; // not referenced again in this function
+  int ld=grid->_ldimensions[orthogdim]; // presumably the per-rank extent along orthogdim -- confirm in GridBase
+  int rd=grid->_rdimensions[orthogdim]; // presumably ld reduced by the SIMD layout along orthogdim -- confirm in GridBase
+
+  // will locally sum vectors first
+  // sum across these down to scalars
+  // splitting the SIMD
+  int MFrvol = rd*Lblock*Rblock*Nmom; // number of vector-typed accumulators (lvSum)
+  int MFlvol = ld*Lblock*Rblock*Nmom; // number of scalar-typed accumulators (lsSum)
+
+  Vector<vector_type > lvSum(MFrvol);
+  thread_loop(  (int r = 0; r < MFrvol; r++),{
+    lvSum[r] = Zero();
+  });
+
+  Vector<scalar_type > lsSum(MFlvol);             
+  thread_loop(  (int r = 0; r < MFlvol; r++),{
+    lsSum[r]=scalar_type(0.0);
+  });
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+
+  thread_loop( (int r=0;r<rd;r++),{ // each r only touches lvSum entries whose index carries the r term
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+
+	int ss= so+n*stride+b; // site index used for every field on this plane
+
+	for(int i=0;i<Lblock;i++){
+
+	  auto wi_v = wi[i].View();
+	  auto w = conjugate(wi_v[ss]);
+
+	  for(int j=0;j<Rblock;j++){
+	    
+	    auto vj_v = vj[j].View();
+	    auto v = vj_v[ss];
+
+	    auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out: + for components (0),(1), - for (2),(3)
+	      +       w()(0)(1) * v()(0)(1)
+	      +       w()(0)(2) * v()(0)(2)
+	      +       w()(1)(0) * v()(1)(0)
+	      +       w()(1)(1) * v()(1)(1)
+	      +       w()(1)(2) * v()(1)(2)
+	      -       w()(2)(0) * v()(2)(0)
+	      -       w()(2)(1) * v()(2)(1)
+	      -       w()(2)(2) * v()(2)(2)
+	      -       w()(3)(0) * v()(3)(0)
+	      -       w()(3)(1) * v()(3)(1)
+	      -       w()(3)(2) * v()(3)(2);
+
+	    
+	    // After getting the sitewise product do the mom phase loop
+	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; // layout: m fastest, then i, then j, then r
+	    for ( int m=0;m<Nmom;m++){
+	      int idx = m+base;
+	      auto mom_v = mom[m].View();
+	      auto phase = mom_v[ss];
+	      mac(&lvSum[idx],&vv,&phase()()()); // presumably lvSum[idx] += vv*phase -- TODO confirm against mac()
+	    }
+	  }
+	}
+      }
+    }
+  });
+
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  thread_loop( (int rt=0;rt<rd;rt++),{ // ldx below satisfies ldx % rd == rt, so different rt never share an lsSum entry
+
+    Coordinate icoor(nd);
+    iScalar<vector_type> temp; // wrapper so that extract() can be given the bare vector_type accumulator
+    ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);               
+
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+    for(int m=0;m<Nmom;m++){
+
+      int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
+
+      temp._internal = lvSum[ij_rdx];
+      extract(temp,extracted); // presumably one scalar object per SIMD lane -- confirm against extract()
+
+      for(int idx=0;idx<Nsimd;idx++){
+
+	grid->iCoorFromIindex(icoor,idx);
+
+	int ldx    = rt+icoor[orthogdim]*rd; // slice index within the ld range for lane idx
+
+	int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
+
+	lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal;
+
+      }
+    }}}
+  });
+
+  assert(mat.dimension(0) == Nmom);
+  assert(mat.dimension(1) == Nt);
+
+  int pd = grid->_processors[orthogdim];
+  int pc = grid->_processor_coor[orthogdim];
+  thread_loop_collapse(2,(int lt=0;lt<ld;lt++),
+  {
+    for(int pt=0;pt<pd;pt++){
+      int t = lt + pt*ld; // index into dimension 1 of mat
+      if (pt == pc){ // slices belonging to this rank's coordinate: copy the local sums
+	for(int i=0;i<Lblock;i++){
+	  for(int j=0;j<Rblock;j++){
+	    for(int m=0;m<Nmom;m++){
+	      int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
+	      mat(m,t,i,j) = lsSum[ij_dx];
+	    }
+	  }
+	}
+      } else { // all other slices: zero, so the global sum below fills them from the other ranks
+	const scalar_type zz(0.0);
+	for(int i=0;i<Lblock;i++){
+	  for(int j=0;j<Rblock;j++){
+	    for(int m=0;m<Nmom;m++){
+	      mat(m,t,i,j) =zz;
+	    }
+	  }
+	}
+      }
+    }
+  });
+
+  grid->GlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock); // treats mat as one contiguous array of that many elements
+}
+
+template<class FImpl>
+void A2Autils<FImpl>::PionFieldWV(Eigen::Tensor<ComplexD,3> &mat,
+                                  const FermionField *wi,
+                                  const FermionField *vj,
+                                  int orthogdim)
+{
+  // W-V variant: forward to PionFieldXX with its g5 flag switched on.
+  PionFieldXX(mat, wi, vj, orthogdim, /*g5=*/1);
+}
+template<class FImpl>
+void A2Autils<FImpl>::PionFieldWW(Eigen::Tensor<ComplexD,3> &mat,
+                                  const FermionField *wi,
+                                  const FermionField *wj,
+                                  int orthogdim)
+{
+  // W-W variant: forward to PionFieldXX with its g5 flag switched off.
+  PionFieldXX(mat, wi, wj, orthogdim, /*g5=*/0);
+}
+template<class FImpl>
+void A2Autils<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
+                                  const FermionField *vi,
+                                  const FermionField *vj,
+                                  int orthogdim)
+{
+  // V-V variant: forward to PionFieldXX with its g5 flag switched off.
+  PionFieldXX(mat, vi, vj, orthogdim, /*g5=*/0);
+}
+
+// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x)
+//
+// With:
+//
+// B_0 = A_0 + i A_1
+// B_1 = A_2 + i A_3
+// 
+// then in spin space
+// 
+//                 ( 0          0          -conj(B_1) -B_0 )
+// i * A_mu g_mu = ( 0          0          -conj(B_0)  B_1 )
+//                 ( B_1        B_0        0          0    )
+//                 ( conj(B_0)  -conj(B_1) 0          0    )
+template <class FImpl>
+template <typename TensorType>
+void A2Autils<FImpl>::AslashField(TensorType &mat,  // out: every element mat(m,0,t,i,j) is overwritten below
+          const FermionField *lhs_wi,   // array of at least mat.dimension(3) fields; site value is conjugated
+          const FermionField *rhs_vj,   // array of at least mat.dimension(4) fields
+          const std::vector<ComplexField> &emB0,  // one field per index m (the B_0 of the matrix above)
+          const std::vector<ComplexField> &emB1,  // same length as emB0 (asserted); the B_1 of the matrix above
+          int orthogdim, double *t_kernel, double *t_gsum)  // timers are optional: each is written only if non-null
+{
+    typedef typename FermionField::vector_object vobj;
+    typedef typename vobj::scalar_object         sobj;          // not referenced in this function
+    typedef typename vobj::scalar_type           scalar_type;
+    typedef typename vobj::vector_type           vector_type;
+
+    typedef iSpinMatrix<vector_type> SpinMatrix_v;
+    typedef iSpinMatrix<scalar_type> SpinMatrix_s;              // not referenced in this function
+    typedef iSinglet<vector_type>    Singlet_v;                 // not referenced in this function
+    typedef iSinglet<scalar_type>    Singlet_s;                 // not referenced in this function
+    
+    int Lblock = mat.dimension(3);  // extent of the i (lhs_wi) index
+    int Rblock = mat.dimension(4);  // extent of the j (rhs_vj) index
+
+    GridBase *grid = lhs_wi[0].Grid();
+    
+    const int    Nd = grid->_ndimension;
+    const int Nsimd = grid->Nsimd();
+
+    int Nt  = grid->GlobalDimensions()[orthogdim];
+    int Nem = emB0.size();
+    assert(emB1.size() == Nem);
+
+    int fd=grid->_fdimensions[orthogdim];  // not referenced below
+    int ld=grid->_ldimensions[orthogdim];
+    int rd=grid->_rdimensions[orthogdim];
+
+    // will locally sum vectors first
+    // sum across these down to scalars
+    // splitting the SIMD
+    int MFrvol = rd*Lblock*Rblock*Nem;
+    int MFlvol = ld*Lblock*Rblock*Nem;
+
+    Vector<vector_type> lvSum(MFrvol);  // SIMD accumulators, flat index m + Nem*(i + Lblock*(j + Rblock*r))
+    thread_loop(  (int r = 0; r < MFrvol; r++),
+    {
+      lvSum[r] = Zero();
+    });
+
+    Vector<scalar_type> lsSum(MFlvol);  // scalar accumulators, same flat layout with ldx in place of r
+    thread_loop(  (int r = 0; r < MFlvol; r++),
+    {
+        lsSum[r] = scalar_type(0.0);
+    });
+
+    int e1=    grid->_slice_nblock[orthogdim];
+    int e2=    grid->_slice_block [orthogdim];
+    int stride=grid->_slice_stride[orthogdim];
+
+    // Nested parallelism would be ok
+    // Wasting cores here. Test case r
+    if (t_kernel) *t_kernel = -usecond();  // timer spans this kernel loop and the SIMD-lane reduction that follows
+    thread_loop( (int r=0;r<rd;r++),
+    {
+        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+        for(int n=0;n<e1;n++)
+        for(int b=0;b<e2;b++)
+        {
+            int ss= so+n*stride+b;
+
+            for(int i=0;i<Lblock;i++)
+            {
+  	        auto wi_v = lhs_wi[i].View();
+                auto left = conjugate(wi_v[ss]);
+
+                for(int j=0;j<Rblock;j++)
+                {
+                    SpinMatrix_v vv;
+		    auto vj_v  = rhs_vj[j].View();
+                    auto right = vj_v[ss];
+
+                    for(int s1=0;s1<Ns;s1++)
+                    for(int s2=0;s2<Ns;s2++)
+                    {
+		          vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)   // colour sum written out for three colours;
+                                        + left()(s2)(1) * right()(s1)(1)   // note s1 indexes right, s2 indexes left
+                                        + left()(s2)(2) * right()(s1)(2);
+                    }
+                    
+                    // After getting the sitewise product do the mom phase loop
+                    int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r;
+
+                    for ( int m=0;m<Nem;m++)
+                    {
+  		        auto emB0_v = emB0[m].View();
+  		        auto emB1_v = emB1[m].View();
+                        int idx  = m+base;
+                        auto b0  = emB0_v[ss];
+                        auto b1  = emB1_v[ss];
+                        auto cb0 = conjugate(b0);
+                        auto cb1 = conjugate(b1);
+
+                        lvSum[idx] += - vv()(3,0)()*b0()()()  - vv()(2,0)()*cb1()()()   // vv(s1,s2) multiplies entry
+                                      + vv()(3,1)()*b1()()()  - vv()(2,1)()*cb0()()()   // (row s2, column s1) of the
+                                      + vv()(0,2)()*b1()()()  + vv()(1,2)()*b0()()()    // i*A_mu*g_mu matrix in the
+                                      + vv()(0,3)()*cb0()()() - vv()(1,3)()*cb1()()();  // comment above this function
+                    }
+                }
+            }
+        }
+    });
+
+    // Sum across simd lanes in the plane, breaking out orthog dir.
+    thread_loop( (int rt=0;rt<rd;rt++),
+    {
+        Coordinate icoor(Nd);
+        ExtractBuffer<scalar_type> extracted(Nsimd);               
+
+        for(int i=0;i<Lblock;i++)
+        for(int j=0;j<Rblock;j++)
+        for(int m=0;m<Nem;m++)
+        {
+
+            int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
+
+            extract<vector_type,scalar_type>(lvSum[ij_rdx],extracted);
+            for(int idx=0;idx<Nsimd;idx++)
+            {
+                grid->iCoorFromIindex(icoor,idx);
+
+                int ldx    = rt+icoor[orthogdim]*rd;  // index into the ld-sized lsSum layout for SIMD lane idx
+                int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
+
+                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
+            }
+        }
+    });
+    if (t_kernel) *t_kernel += usecond();
+
+    // ld loop and local only??
+    int pd = grid->_processors[orthogdim];
+    int pc = grid->_processor_coor[orthogdim];
+    thread_loop_collapse(2,(int lt=0;lt<ld;lt++),
+    {
+        for(int pt=0;pt<pd;pt++)
+        {
+            int t = lt + pt*ld;  // slot along orthogdim belonging to processor coordinate pt
+            if (pt == pc)        // this rank's own slots receive the local sums
+            {
+                for(int i=0;i<Lblock;i++)
+                for(int j=0;j<Rblock;j++)
+                for(int m=0;m<Nem;m++)
+                {
+                    int ij_dx = m+Nem*i + Nem*Lblock * j + Nem*Lblock * Rblock * lt;
+
+                    mat(m,0,t,i,j) = lsSum[ij_dx];
+                }
+            } 
+            else                 // every other slot is zeroed so the global sum below contributes nothing from this rank
+            { 
+                const scalar_type zz(0.0);
+
+                for(int i=0;i<Lblock;i++)
+                for(int j=0;j<Rblock;j++)
+                for(int m=0;m<Nem;m++)
+                {
+                    mat(m,0,t,i,j) = zz;
+                }
+            }
+        }
+    });
+    if (t_gsum) *t_gsum = -usecond();
+    grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock);  // NOTE(review): element count assumes mat dims are (Nem,1,Nt,Lblock,Rblock) - confirm at call sites
+    if (t_gsum) *t_gsum += usecond();
+}
+
+////////////////////////////////////////////
+// Schematic thoughts about more generalised four quark insertion
+//
+// --  Pupil or fig8 type topology (depending on flavour structure) done below
+// --  Have  Bag style   --  WWVV  VVWW
+//            _  _      
+//           / \/ \
+//           \_/\_/   
+//                                                                                                                                                                       
+//  -   Kpipi style (two pion insertions) 
+//                                K     
+// *********
+// Type 1                    
+// *********
+//                x
+//            ___  _____ pi(b)  
+//        K  /   \/____/          
+//           \    \
+//            \____\pi(a)   
+//
+//                        
+//        W^W_sd V_s(x)  V^_d(xa) Wa(xa) Va^(x)    WW_bb' Vb Vb'(x)
+//
+//        (Kww.PIvw) VV ;   pi_ww.VV
+// 
+//  But Norman and Chris say g5 hermiticity not used, except on K
+//
+//        Kww    PIvw     PIvw       
+//
+//        (W^W_sd PIa_dd')   PIb_uu'  (v_s v_d' w_u v_u')(x)
+// 
+//  - Can form (Nmode^3)
+//
+//   (Kww . PIvw)_sd and then V_sV_d  tensor product contracting this
+// 
+//  - Can form 
+//
+//   (PIvw)_uu' W_uV_u'  tensor product.
+//
+// Both are lattice propagators.
+//
+// Contract with the same four quark type contraction as BK.
+//
+// *********
+// Type 2
+// *********
+//            _  _____ pi 
+//        K  / \/     |   
+//           \_/\_____|pi 
+//
+// Norman/Chris would say
+//
+//  (Kww VV)(x) . (PIwv Piwv) VW(x)
+//
+// Full four quark via g5 hermiticity
+//
+//  (Kww VV)(x) . (PIww Pivw) VV(x)
+//
+// *********
+// Type 3
+// *********
+//            ___  _____ pi 
+//        K  /   \/     |   
+//           \   /\     |
+//            \  \/     |
+//             \________|pi 
+// 
+// 
+//   (V(x) . Kww . pi_vw . pivw . V(x)). WV(x)
+//
+// No difference possible to Norman, Chris, Diaqian
+//
+// *********
+// Type 4
+// *********
+//            ___        pi 
+//        K  /   \/\     |\
+//           \___/\/     |/
+//                        pi
+//            
+//  (Kww VV ) WV        x   Tr(PIwv PIwv)
+//        
+// Could use alternate PI_ww PIvv for disco loop interesting comparison
+//
+// *********
+// Primitives / Utilities for assembly
+// *********
+// 
+// i)   Basic type for meson field - mode indexed object, whether WW, VW, VV etc..
+// ii)  Multiply two meson fields (modes^3) ; use BLAS MKL via Eigen
+// iii) Multiply and trace two meson fields ; use BLAS MKL via Eigen. The element wise product trick
+// iv)  Contract a meson field (whether WW,VW, VV WV) with W and V fields to form LatticePropagator
+// v)   L^3 sum a four quark contraction with two LatticePropagators.
+//
+// Use lambda functions to enable flexibility in v)
+// Use lambda functions to unify the pion field / meson field contraction codes.
+////////////////////////////////////////////
+
+
+
+////////////////////////////////////////////////////////////////////////
+// DeltaF=2 contraction ; use external WW field for Kaon, anti Kaon sink
+////////////////////////////////////////////////////////////////////////
+//
+// WW -- i vectors have adjoint, and j vectors not. 
+//    -- Think of "i" as the strange quark, forward prop from 0
+//    -- Think of "j" as the anti-down quark.
+//
+// WW_sd are w^dag_s w_d
+//
+// Hence VV vectors correspondingly are  v^dag_d,  v_s    from t=0
+//                                  and  v^dag_d,  v_s    from t=dT
+//
+// There is an implicit g5 associated with each v^dag_d from use of g5 Hermiticity.
+// The other gamma_5 lies in the WW external meson operator.
+//
+// From UKhadron wallbag.cc:
+//
+//   LatticePropagator anti_d0 =  adj( Gamma(G5) * Qd0 * Gamma(G5));
+//   LatticePropagator anti_d1 =  adj( Gamma(G5) * Qd1 * Gamma(G5));
+//
+//   PR1 = Qs0 * Gamma(G5) * anti_d0;
+//   PR2 = Qs1 * Gamma(G5) * anti_d1;
+//
+//   TR1 = trace( PR1 * G1 );
+//   TR2 = trace( PR2 * G2 );
+//   Wick1 = TR1 * TR2;
+//
+//   Wick2 = trace( PR1* G1 * PR2 * G2 );
+//   // was      Wick2 = trace( Qs0 * Gamma(G5) * anti_d0 * G1 * Qs1 * Gamma(G5) * anti_d1 * G2 );
+//
+// TR TR(tx) = Wick1 = sum_x WW[t0]_sd < v^_d |g5 G| v_s>   WW[t1]_s'd' < v^_d' |g5 G| v_s'> |_{x,tx}
+//           = sum_x [ Trace(WW[t0] VgV(t,x) )  x Trace( WW_[t1] VgV(t,x) ) ]
+//           
+//
+// Calc all Nt Trace(WW VV) products at once, take Nt^2 products of these.
+//
+// Fig8(tx)  = Wick2 = sum_x WW[t0]_sd WW[t1]_s'd'  < v^_d |g5 G| v_s'> < v^_d' |g5 G| v_s> |_{x,tx}
+//
+//                   = sum_x Trace( WW[t0] VV[t,x] WW[t1] VV[t,x] )
+// 
+///////////////////////////////////////////////////////////////////////////////
+//
+// WW is w_s^dag (x) w_d       (G5 implicitly absorbed)
+//
+// WWVV will have spin-col (x) spin-col tensor.
+//
+// Want this to look like a strange fwd prop, anti-d prop.
+// 
+// Take WW_sd v^dag_d (x) v_s
+// 
+
+template<class FImpl>
+void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,       // out: WWVV[t] is zeroed then accumulated for t < WW_sd.dimension(0)
+				   const Eigen::Tensor<ComplexD,3> &WW_sd,  // coefficients read as WW_sd(t,s,d)
+				   const FermionField *vs,                    // array of at least WW_sd.dimension(1) fields
+				   const FermionField *vd)                    // array of at least WW_sd.dimension(2) fields
+{
+  GridBase *grid = vs[0].Grid();
+
+  int nd    = grid->_ndimension;  // not referenced below
+  int Nsimd = grid->Nsimd();      // not referenced below
+  int N_t   = WW_sd.dimension(0);
+  int N_s = WW_sd.dimension(1); 
+  int N_d = WW_sd.dimension(2);
+
+  int d_unroll = 32;// Empirical optimisation
+
+  for(int t=0;t<N_t;t++){
+    WWVV[t] = Zero();
+  }
+
+  thread_loop( (int ss=0;ss<grid->oSites();ss++),{
+    for(int d_o=0;d_o<N_d;d_o+=d_unroll){  // d is processed in chunks of d_unroll; one outer product per (chunk,t,s)
+      for(int t=0;t<N_t;t++){
+      for(int s=0;s<N_s;s++){
+	auto vs_v = vs[s].View();
+	auto tmp1 = vs_v[ss];
+	vobj tmp2 = Zero();  // vobj / Scalar_v are not declared in this function - presumably class-level typedefs
+
+	for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
+	  Scalar_v coeff = WW_sd(t,s,d);
+	  auto vd_v = vd[d].View();
+	  mac(&tmp2 ,& coeff, & vd_v[ss]);  // presumably tmp2 += coeff * vd_v[ss]; confirm against mac()
+	}
+
+	//////////////////////////
+	// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
+	//////////////////////////
+	tmp2 = conjugate(tmp2);
+	auto WWVV_v = WWVV[t].View();
+	for(int s1=0;s1<Ns;s1++){
+	for(int s2=0;s2<Ns;s2++){
+	  // element (s1,s2)(c1,c2) += tmp1(s1)(c1) * tmp2(s2)(c2), written out for three colours
+	  WWVV_v[ss]()(s1,s2)(0,0) += tmp1()(s1)(0)*tmp2()(s2)(0);
+	  WWVV_v[ss]()(s1,s2)(0,1) += tmp1()(s1)(0)*tmp2()(s2)(1);
+	  WWVV_v[ss]()(s1,s2)(0,2) += tmp1()(s1)(0)*tmp2()(s2)(2);
+	  WWVV_v[ss]()(s1,s2)(1,0) += tmp1()(s1)(1)*tmp2()(s2)(0);
+	  WWVV_v[ss]()(s1,s2)(1,1) += tmp1()(s1)(1)*tmp2()(s2)(1);
+	  WWVV_v[ss]()(s1,s2)(1,2) += tmp1()(s1)(1)*tmp2()(s2)(2);
+	  WWVV_v[ss]()(s1,s2)(2,0) += tmp1()(s1)(2)*tmp2()(s2)(0);
+	  WWVV_v[ss]()(s1,s2)(2,1) += tmp1()(s1)(2)*tmp2()(s2)(1);
+	  WWVV_v[ss]()(s1,s2)(2,2) += tmp1()(s1)(2)*tmp2()(s2)(2);
+	}}
+
+      }}
+    }
+  });
+}
+
+
+template<class FImpl>
+void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWVV0,
+						      const PropagatorField &WWVV1,
+						      const std::vector<Gamma> &gamma0,
+						      const std::vector<Gamma> &gamma1,
+						      ComplexField &O_trtr,
+						      ComplexField &O_fig8)
+{
+  // Per site: sum over g of trace(.)*trace(.) into O_trtr and of the single trace of the product into O_fig8.
+  assert(gamma0.size()==gamma1.size());
+  const int nGamma = gamma0.size();
+
+  auto prop0_v = WWVV0.View();
+  auto prop1_v = WWVV1.View();
+  auto trtr_v  = O_trtr.View();
+  auto fig8_v  = O_fig8.View();
+
+  GridBase *layout = WWVV0.Grid();
+  thread_loop( (int site=0;site<layout->oSites();site++),{
+
+    typedef typename ComplexField::vector_object vobj;
+
+    auto P0 = prop0_v[site];
+    auto P1 = prop1_v[site];
+
+    for(int g=0;g<nGamma;g++){
+
+      vobj trtr_g;
+      vobj fig8_g;
+      trtr_g = trace(P0 * gamma0[g])* trace(P1*gamma1[g]);
+      fig8_g = trace(P0 * gamma0[g] * P1 * gamma1[g]);
+
+      // The first gamma pair overwrites the outputs; every later pair accumulates.
+      if ( g!=0 ) {
+	trtr_v[site]+= trtr_g;
+	fig8_v[site]+= fig8_g;
+      } else {
+	trtr_v[site] = trtr_g;
+	fig8_v[site] = fig8_g;
+      }
+    }
+  });
+}
+
+template<class FImpl>
+void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
+						 const PropagatorField &WWVV1,
+						 const std::vector<Gamma> &gamma0,   // must have the same length as gamma1 (asserted)
+						 const std::vector<Gamma> &gamma1,
+						 ComplexField &O_trtr,               // out: overwritten at g==0, accumulated for g>0
+						 ComplexField &O_fig8)               // out: overwritten at g==0, accumulated for g>0
+{
+  assert(gamma0.size()==gamma1.size());
+  int Ng = gamma0.size();  // if Ng==0 the outputs are never written
+
+  GridBase *grid = WWVV0.Grid();
+
+  auto WWVV0_v = WWVV0.View();
+  auto WWVV1_v = WWVV1.View();
+  auto O_trtr_v= O_trtr.View();
+  auto O_fig8_v= O_fig8.View();
+
+  thread_loop( (int ss=0;ss<grid->oSites();ss++),{
+
+    typedef typename ComplexField::vector_object vobj;
+
+    auto VV0 = WWVV0_v[ss];
+    auto VV1 = WWVV1_v[ss];
+    
+    for(int g=0;g<Ng;g++){
+
+      auto VV0G = VV0 * gamma0[g];  // Spin multiply
+      auto VV1G = VV1 * gamma1[g];
+
+      vobj v_trtr=Zero();
+      vobj v_fig8=Zero();
+
+      /////////////////////////////////////////
+      // Colour mixed
+      /////////////////////////////////////////
+      // _                 _
+      // s_sa G_st d_tb    s_s'b  G_s't' d_t'a
+      // 
+      //                                         
+      // Contracted with prop factor (VV0)_sd,ab (VV1)_s'd',ba
+      //
+      // Wick1 [ spin TR TR ]
+      //
+      //    (VV0*G0)_ss,ba .  (VV1*G1)_tt,ab
+       //
+      // Wick2 [ spin fig8 ]
+      //
+      //    (VV0*G0)_st,aa (VV1*G1)_ts,bb
+      // 
+      /////////////////////////////////////////
+
+      for(int a=0;a<Nc;a++){
+      for(int b=0;b<Nc;b++){
+      for(int s=0;s<Ns;s++){
+      for(int t=0;t<Ns;t++){
+	// Mixed traces
+	v_trtr()()() += VV0G()(s,s)(a,b)*VV1G()(t,t)(b,a); // Was the fig8 before Fierzing
+	v_fig8()()() += VV0G()(s,t)(a,a)*VV1G()(t,s)(b,b); // Was the trtr before Fierzing
+
+	/*
+	 * CHECKS -- use Fierz identities as a strong test, 4 Oct 2018.
+	 * 
+BagMix [8,0]  fig8 (21.5596,-3.83908e-17) trtr (0.064326,2.51001e-17) // Fierz        -1 0   0 0 0    
+BagMix [8,1]  fig8 (-1346.99,1.2481e-16) trtr (34.2501,-3.36935e-17)  //               0 0   2 0 0 
+BagMix [8,2]  fig8 (13.7536,-6.04625e-19) trtr (-215.542,3.24326e-17) //               0 1/2 0 0 0
+BagMix [8,3]  fig8 (555.878,-7.39942e-17) trtr (463.82,-4.73909e-17)  //               0 0   0 1/2 -1/2  
+BagMix [8,4]  fig8 (-1602.48,9.08511e-17) trtr (-936.302,1.14156e-16) //               0 0   0 -3/2 -1/2
+
+Bag [8,0]  fig8 (-0.064326,1.06281e-17) trtr (-21.5596,1.06051e-17)   
+Bag [8,2]  fig8 (17.125,-3.40959e-17) trtr (-673.493,7.68134e-17)   
+Bag [8,1]  fig8 (-431.084,2.76423e-17) trtr (27.5073,-5.76967e-18)    /////////// TR TR                           FIG8
+Bag [8,3]  fig8 (700.061,-1.14925e-16) trtr (1079.18,-1.35476e-16)    //    555.878 = 0.5(1079.18+32.5776) ;   463.82     =0.5(700.061+227.58)
+Bag [8,4]  fig8 (-227.58,3.58808e-17) trtr (-32.5776,1.83286e-17)     //  - 1602.48 = - 1.5*1079.18 + .5* 32.5776; 936.302=-1.5* 700+0.5*227
+	*/
+	
+	//Unmixed debug check consistency
+	//	v_trtr()()() += VV0G()(s,s)(a,a)*VV1G()(t,t)(b,b);
+	//	v_fig8()()() += VV0G()(s,t)(a,b)*VV1G()(t,s)(b,a);
+      }}}}
+
+      if ( g==0 ) {  // first gamma pair overwrites the outputs, later pairs accumulate
+	O_trtr_v[ss] = v_trtr; 
+	O_fig8_v[ss] = v_fig8;
+      } else { 
+	O_trtr_v[ss]+= v_trtr; 
+	O_fig8_v[ss]+= v_fig8;
+      }
+      
+    }
+  });
+}
+
+#ifdef DELTA_F_EQ_2
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Perhaps this should move out of the utils and into Hadrons module
+// Now makes use of the primitives above and doesn't touch inside 
+// the lattice structures.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class FImpl>
+void A2Autils<FImpl>::DeltaFeq2(int dt_min,int dt_max,  // only dt_min sets the separation below; dt_max is just logged
+				Eigen::Tensor<ComplexD,2> &dF2_fig8,      // out: resized to (N_t,5), zeroed, then accumulated
+				Eigen::Tensor<ComplexD,2> &dF2_trtr,      // out: resized to (N_t,5), zeroed, then accumulated
+				Eigen::Tensor<ComplexD,2> &dF2_fig8_mix,  // out: as above, from the colour-mixed contraction
+				Eigen::Tensor<ComplexD,2> &dF2_trtr_mix,  // out: as above, from the colour-mixed contraction
+				Eigen::Tensor<ComplexD,1> &denom_A0,      // out: resized to N_t, zeroed, then accumulated
+				Eigen::Tensor<ComplexD,1> &denom_P,       // out: resized to N_t, zeroed, then accumulated
+				Eigen::Tensor<ComplexD,3> &WW_sd,         // dimension(0) must equal the global extent along orthogdim (asserted)
+				const FermionField *vs,
+				const FermionField *vd,
+				int orthogdim)
+{
+  GridBase *grid = vs[0].Grid();
+
+  LOG(Message) << "Computing A2A DeltaF=2 graph" << std::endl;
+
+  auto G5 = Gamma(Gamma::Algebra::Gamma5);
+  
+  double nodes = grid->NodeCount();  // not referenced below
+  int nd    = grid->_ndimension;
+  int Nsimd = grid->Nsimd();         // not referenced below
+  int N_t = WW_sd.dimension(0); 
+  int N_s = WW_sd.dimension(1);      // not referenced below
+  int N_d = WW_sd.dimension(2);      // not referenced below
+
+  assert(grid->GlobalDimensions()[orthogdim] == N_t);
+  double vol         = 1.0;          // product of all global dimensions, accumulated as a double
+  for(int dim=0;dim<nd;dim++){
+    vol = vol * grid->GlobalDimensions()[dim];
+  }
+
+  double t_tot  = -usecond();
+  std::vector<PropagatorField> WWVV (N_t,grid);
+
+  double t_outer= -usecond();
+  ContractWWVV(WWVV,WW_sd,&vs[0],&vd[0]);
+  t_outer+=usecond();
+
+  //////////////////////////////
+  // Implicit gamma-5 
+  //////////////////////////////
+  for(int t=0;t<N_t;t++){
+    WWVV[t] = WWVV[t]* G5 ; 
+  }
+
+  ////////////////////////////////////////////////////////
+  // Contraction
+  ////////////////////////////////////////////////////////
+  int Ng=5;  // number of operator combinations stored per time slice (second index 0..4 below)
+  dF2_trtr.resize(N_t,Ng);
+  dF2_fig8.resize(N_t,Ng);
+  dF2_trtr_mix.resize(N_t,Ng);
+  dF2_fig8_mix.resize(N_t,Ng);
+  denom_A0.resize(N_t);
+  denom_P.resize(N_t);
+  for(int t=0;t<N_t;t++){
+    for(int g=0;g<Ng;g++) dF2_trtr(t,g)= ComplexD(0.0);
+    for(int g=0;g<Ng;g++) dF2_fig8(t,g)= ComplexD(0.0);
+    for(int g=0;g<Ng;g++) dF2_trtr_mix(t,g)= ComplexD(0.0);
+    for(int g=0;g<Ng;g++) dF2_fig8_mix(t,g)= ComplexD(0.0);
+    denom_A0(t) =ComplexD(0.0);
+    denom_P(t) =ComplexD(0.0);
+  }
+
+  ComplexField D0(grid);   D0 = Zero(); // <P|A0> correlator from each wall
+  ComplexField D1(grid);   D1 = Zero(); // D0 and D1 are zeroed here but not referenced again in this function
+
+  ComplexField O1_trtr(grid);  O1_trtr = Zero();
+  ComplexField O2_trtr(grid);  O2_trtr = Zero();
+  ComplexField O3_trtr(grid);  O3_trtr = Zero();
+  ComplexField O4_trtr(grid);  O4_trtr = Zero();
+  ComplexField O5_trtr(grid);  O5_trtr = Zero();
+
+  ComplexField O1_fig8(grid);  O1_fig8 = Zero();
+  ComplexField O2_fig8(grid);  O2_fig8 = Zero();
+  ComplexField O3_fig8(grid);  O3_fig8 = Zero();
+  ComplexField O4_fig8(grid);  O4_fig8 = Zero();
+  ComplexField O5_fig8(grid);  O5_fig8 = Zero();
+
+  ComplexField VV_trtr(grid);  VV_trtr = Zero();
+  ComplexField AA_trtr(grid);  AA_trtr = Zero();
+  ComplexField SS_trtr(grid);  SS_trtr = Zero();
+  ComplexField PP_trtr(grid);  PP_trtr = Zero();
+  ComplexField TT_trtr(grid);  TT_trtr = Zero();
+
+  ComplexField VV_fig8(grid);  VV_fig8 = Zero();
+  ComplexField AA_fig8(grid);  AA_fig8 = Zero();
+  ComplexField SS_fig8(grid);  SS_fig8 = Zero();
+  ComplexField PP_fig8(grid);  PP_fig8 = Zero();
+  ComplexField TT_fig8(grid);  TT_fig8 = Zero();
+
+  //////////////////////////////////////////////////
+  // Used to store appropriate correlation funcs
+  //////////////////////////////////////////////////
+  std::vector<TComplex>  C1;  // scratch for every sliceSum below
+  std::vector<TComplex>  C2;  // used only for the PP slice sum feeding denom_P
+  std::vector<TComplex>  C3;  // C3, C4 and C5 are not referenced below
+  std::vector<TComplex>  C4;
+  std::vector<TComplex>  C5;
+  
+  //////////////////////////////////////////////////////////
+  // Could do AA, VV, SS, PP, TT and form linear combinations later.
+  // Almost 2x. but for many modes, the above loop dominates.
+  //////////////////////////////////////////////////////////
+  double t_contr= -usecond();
+
+  // Tr Tr Wick contraction
+  auto VX = Gamma(Gamma::Algebra::GammaX);
+  auto VY = Gamma(Gamma::Algebra::GammaY);
+  auto VZ = Gamma(Gamma::Algebra::GammaZ);
+  auto VT = Gamma(Gamma::Algebra::GammaT);
+  
+  auto AX = Gamma(Gamma::Algebra::GammaXGamma5);
+  auto AY = Gamma(Gamma::Algebra::GammaYGamma5);
+  auto AZ = Gamma(Gamma::Algebra::GammaZGamma5);
+  auto AT = Gamma(Gamma::Algebra::GammaTGamma5);
+  
+  auto S  = Gamma(Gamma::Algebra::Identity);
+  auto P  = Gamma(Gamma::Algebra::Gamma5);
+  
+  auto T0 = Gamma(Gamma::Algebra::SigmaXY);
+  auto T1 = Gamma(Gamma::Algebra::SigmaXZ);
+  auto T2 = Gamma(Gamma::Algebra::SigmaXT);
+  auto T3 = Gamma(Gamma::Algebra::SigmaYZ);
+  auto T4 = Gamma(Gamma::Algebra::SigmaYT);
+  auto T5 = Gamma(Gamma::Algebra::SigmaZT);
+
+  std::cout <<GridLogMessage << " dt " <<dt_min<<"..." <<dt_max<<std::endl;
+
+  for(int t0=0;t0<N_t;t0++){
+    std::cout <<GridLogMessage << " t0 " <<t0<<std::endl;
+    //    for(int dt=dt_min;dt<dt_max;dt++){
+    {     
+      int dt  = dt_min;        // the dt loop above is disabled: a single separation dt_min is used
+      int t1 = (t0+dt)%N_t;    // NOTE(review): a negative dt_min would give a negative index here - confirm callers pass dt_min >= 0
+
+      std::cout <<GridLogMessage << " t1 " <<t1<<std::endl;
+      std::vector<Gamma> VV({VX,VY,VZ,VT});
+      std::vector<Gamma> AA({AX,AY,AZ,AT});
+      std::vector<Gamma> SS({S});
+      std::vector<Gamma> PP({P});
+      std::vector<Gamma> TT({T0,T1,T2,T3,T4,T5});
+      std::vector<Gamma> A0({AT});
+
+      ContractFourQuarkColourDiagonal(WWVV[t0],	WWVV[t1],VV,VV,VV_trtr,VV_fig8); // VV
+      ContractFourQuarkColourDiagonal(WWVV[t0],	WWVV[t1],AA,AA,AA_trtr,AA_fig8); // AA
+      ContractFourQuarkColourDiagonal(WWVV[t0],	WWVV[t1],SS,SS,SS_trtr,SS_fig8); // SS
+      ContractFourQuarkColourDiagonal(WWVV[t0],	WWVV[t1],PP,PP,PP_trtr,PP_fig8); // PP
+      ContractFourQuarkColourDiagonal(WWVV[t0],	WWVV[t1],TT,TT,TT_trtr,TT_fig8); // TT
+
+      O1_trtr = VV_trtr+AA_trtr;      O2_trtr = VV_trtr-AA_trtr;  // VV+AA,VV-AA
+      O1_fig8 = VV_fig8+AA_fig8;      O2_fig8 = VV_fig8-AA_fig8;  
+
+      O3_trtr = SS_trtr-PP_trtr;      O4_trtr = SS_trtr+PP_trtr;  // SS-PP,SS+PP
+      O3_fig8 = SS_fig8-PP_fig8;      O4_fig8 = SS_fig8+PP_fig8;  
+
+      O5_trtr = TT_trtr;
+      O5_fig8 = TT_fig8;
+
+      // Each line: slice-sum along orthogdim, then accumulate with the time index shifted by t0 (mod N_t), scaled by 2/vol.
+      sliceSum(O1_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O2_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O3_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O4_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O5_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+
+      sliceSum(O1_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O2_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O3_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O4_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O5_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+
+      ContractFourQuarkColourDiagonal(WWVV[t0],	WWVV[t1],A0,A0,AA_trtr,AA_fig8); // A0 insertion (overwrites the AA_* scratch fields)
+
+      sliceSum(AA_trtr,C1, orthogdim);
+      sliceSum(PP_trtr,C2, orthogdim);  // PP_trtr still holds the colour-diagonal PP result from above
+
+      for(int t=0;t<N_t;t++){
+	denom_A0(t)+=C1[(t+t0)%N_t]()()()/vol;
+	denom_P(t) +=C2[(t+t0)%N_t]()()()/vol;
+      }
+
+      ///////////////////////////////////////////////////////////////////////////
+      // Colour mixed contractions
+      ///////////////////////////////////////////////////////////////////////////
+
+      ContractFourQuarkColourMix(WWVV[t0],	WWVV[t1],VV,VV,VV_trtr,VV_fig8); // VV
+      ContractFourQuarkColourMix(WWVV[t0],	WWVV[t1],AA,AA,AA_trtr,AA_fig8); // AA
+      ContractFourQuarkColourMix(WWVV[t0],	WWVV[t1],SS,SS,SS_trtr,SS_fig8); // SS
+      ContractFourQuarkColourMix(WWVV[t0],	WWVV[t1],PP,PP,PP_trtr,PP_fig8); // PP
+      ContractFourQuarkColourMix(WWVV[t0],	WWVV[t1],TT,TT,TT_trtr,TT_fig8); // TT
+
+      O1_trtr = VV_trtr+AA_trtr;      O2_trtr = VV_trtr-AA_trtr;  // VV+AA,VV-AA
+      O1_fig8 = VV_fig8+AA_fig8;      O2_fig8 = VV_fig8-AA_fig8;  
+
+      O3_trtr = SS_trtr-PP_trtr;      O4_trtr = SS_trtr+PP_trtr;  // SS-PP,SS+PP
+      O3_fig8 = SS_fig8-PP_fig8;      O4_fig8 = SS_fig8+PP_fig8;  
+
+      O5_trtr = TT_trtr;
+      O5_fig8 = TT_fig8;
+
+      sliceSum(O1_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O2_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O3_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O4_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O5_trtr,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_trtr_mix(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+
+      sliceSum(O1_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,0)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O2_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,1)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O3_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,2)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O4_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,3)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+      sliceSum(O5_fig8,C1, orthogdim); for(int t=0;t<N_t;t++) dF2_fig8_mix(t,4)+= 2.0*C1[(t+t0)%N_t]()()()/vol;
+
+    }
+  }
+  t_contr +=usecond();
+  
+  t_tot+=usecond();
+  double million=1.0e6;  // usecond() differences are divided by this so the log lines are in seconds
+  LOG(Message) << "Computing A2A DeltaF=2 graph t_tot      " << t_tot      /million << " s "<< std::endl;
+  LOG(Message) << "Computing A2A DeltaF=2 graph t_outer    " << t_outer    /million << " s "<< std::endl;
+  LOG(Message) << "Computing A2A DeltaF=2 graph t_contr    " << t_contr    /million << " s "<< std::endl;
+}
+#endif 
+
+}}
+
diff --git a/lib/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h
similarity index 100%
rename from lib/qcd/utils/CovariantCshift.h
rename to Grid/qcd/utils/CovariantCshift.h
diff --git a/lib/qcd/utils/CovariantLaplacian.h b/Grid/qcd/utils/CovariantLaplacian.h
similarity index 98%
rename from lib/qcd/utils/CovariantLaplacian.h
rename to Grid/qcd/utils/CovariantLaplacian.h
index 461412dc..0e0620a7 100644
--- a/lib/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@@ -26,9 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
-
-#ifndef COVARIANT_LAPLACIAN_H
-#define COVARIANT_LAPLACIAN_H
+#pragma once 
 
 NAMESPACE_BEGIN(Grid);
 
@@ -191,5 +189,3 @@ private:
 };
 
 NAMESPACE_END(Grid);
-
-#endif
diff --git a/lib/qcd/utils/GaugeFix.h b/Grid/qcd/utils/GaugeFix.h
similarity index 100%
rename from lib/qcd/utils/GaugeFix.h
rename to Grid/qcd/utils/GaugeFix.h
diff --git a/lib/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h
similarity index 100%
rename from lib/qcd/utils/LinalgUtils.h
rename to Grid/qcd/utils/LinalgUtils.h
diff --git a/lib/qcd/utils/Metric.h b/Grid/qcd/utils/Metric.h
similarity index 99%
rename from lib/qcd/utils/Metric.h
rename to Grid/qcd/utils/Metric.h
index 3389ab16..10d06de8 100644
--- a/lib/qcd/utils/Metric.h
+++ b/Grid/qcd/utils/Metric.h
@@ -27,8 +27,7 @@ directory
 *************************************************************************************/
 			   /*  END LEGAL */
 			   //--------------------------------------------------------------------
-#ifndef METRIC_H
-#define METRIC_H
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -213,4 +212,3 @@ public:
 
 NAMESPACE_END(Grid);
 
-#endif //METRIC_H
diff --git a/lib/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h
similarity index 100%
rename from lib/qcd/utils/SUn.h
rename to Grid/qcd/utils/SUn.h
diff --git a/lib/qcd/utils/SUnAdjoint.h b/Grid/qcd/utils/SUnAdjoint.h
similarity index 100%
rename from lib/qcd/utils/SUnAdjoint.h
rename to Grid/qcd/utils/SUnAdjoint.h
diff --git a/lib/qcd/utils/SUnTwoIndex.h b/Grid/qcd/utils/SUnTwoIndex.h
similarity index 100%
rename from lib/qcd/utils/SUnTwoIndex.h
rename to Grid/qcd/utils/SUnTwoIndex.h
diff --git a/lib/qcd/utils/ScalarObjs.h b/Grid/qcd/utils/ScalarObjs.h
similarity index 100%
rename from lib/qcd/utils/ScalarObjs.h
rename to Grid/qcd/utils/ScalarObjs.h
diff --git a/lib/qcd/utils/SpaceTimeGrid.cc b/Grid/qcd/utils/SpaceTimeGrid.cc
similarity index 100%
rename from lib/qcd/utils/SpaceTimeGrid.cc
rename to Grid/qcd/utils/SpaceTimeGrid.cc
diff --git a/lib/qcd/utils/SpaceTimeGrid.h b/Grid/qcd/utils/SpaceTimeGrid.h
similarity index 100%
rename from lib/qcd/utils/SpaceTimeGrid.h
rename to Grid/qcd/utils/SpaceTimeGrid.h
diff --git a/lib/qcd/utils/Utils.h b/Grid/qcd/utils/Utils.h
similarity index 75%
rename from lib/qcd/utils/Utils.h
rename to Grid/qcd/utils/Utils.h
index 1786db54..8ce3df37 100644
--- a/lib/qcd/utils/Utils.h
+++ b/Grid/qcd/utils/Utils.h
@@ -12,4 +12,10 @@
 #include <Grid/qcd/utils/SUnAdjoint.h>
 #include <Grid/qcd/utils/SUnTwoIndex.h>
 
+// All-to-all contraction kernels that touch the 
+// internal lattice structure
+#include <Grid/qcd/utils/A2Autils.h>
+
+
+
 #endif
diff --git a/lib/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h
similarity index 94%
rename from lib/qcd/utils/WilsonLoops.h
rename to Grid/qcd/utils/WilsonLoops.h
index 972c9291..d6f4c763 100644
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -123,6 +123,28 @@ public:
     return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
   }
 
+
+  //////////////////////////////////////////////////
+  // average over all x,y,z the temporal loop
+  //////////////////////////////////////////////////
+  // Trace of the product of direction-3 links, summed over all sites and divided by Nc times the global volume.
+  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {  //assume Nd=4
+    GridBase *grid = Umu.Grid(); // use the accessor, as the rest of this class does, not the raw _grid member
+    GaugeMat Ut(grid), P(grid);
+    ComplexD out;
+    int T = grid->GlobalDimensions()[3];
+
+    Ut = peekLorentz(Umu,3); //Select temporal direction
+    P = Ut;
+    for (int t=1;t<T;t++){ 
+      P = Gimpl::CovShiftForward(Ut,3,P);
+    }
+    RealD vol = Nc; // accumulate Nc*X*Y*Z*T in floating point: the int product can overflow on large lattices
+    for (int d=0;d<4;d++) vol *= grid->GlobalDimensions()[d];
+    out = sum(trace(P))*(1.0/vol);
+    return out;
+  }
+
   //////////////////////////////////////////////////
   // average over traced single links
   //////////////////////////////////////////////////
@@ -190,6 +212,7 @@ public:
 
 
   // For the force term
+/*
   static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
     GridBase *grid = Umu.Grid();
     std::vector<GaugeMat> U(Nd, grid);
@@ -203,7 +226,7 @@ public:
 
     for (int nu = 0; nu < Nd; nu++) {
       if (nu != mu) {
-        // this is ~10% faster than the Staple
+        // this is ~10% faster than the Staple  -- PAB: so what it gives the WRONG answers for other BC's!
         tmp1 = Cshift(U[nu], mu, 1);
         tmp2 = Cshift(U[mu], nu, 1);
         staple += tmp1* adj(U[nu]*tmp2);
@@ -213,7 +236,7 @@ public:
     }
     staple = U[mu]*staple;
   }
-
+*/
   //////////////////////////////////////////////////
   // the sum over all staples on each site
   //////////////////////////////////////////////////
@@ -291,9 +314,9 @@ public:
     }
   }
 
-  //////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////
   // the sum over all staples on each site in direction mu,nu, lower part
-  //////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////
   static void StapleLower(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                           int nu) {
     if (nu != mu) {
@@ -315,7 +338,9 @@ public:
       //
       staple = Gimpl::ShiftStaple(
 				  Gimpl::CovShiftBackward(U[nu], nu,
-							  Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+          mu);
+
     }
   }
 
@@ -325,7 +350,7 @@ public:
   static void FieldStrength(GaugeMat &FS, const GaugeLorentz &Umu, int mu, int nu){
     // Fmn +--<--+  Ut +--<--+
     //     |     |     |     |
-    //  (x)+-->--+     +-->--+(x)
+      //  (x)+-->--+     +-->--+(x)  - h.c.
     //     |     |     |     |
     //     +--<--+     +--<--+
 
@@ -335,7 +360,9 @@ public:
     GaugeMat v = Vup - Vdn;
     GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
     GaugeMat vu = v*u;
-    FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
+      //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
+      FS = (u*v + Cshift(vu, mu, -1));
+      FS = 0.125*(FS - adj(FS));
   }
 
   static Real TopologicalCharge(GaugeLorentz &U){
@@ -360,6 +387,7 @@ public:
     return TensorRemove(Tq).real();
   }
 
+
   //////////////////////////////////////////////////////
   // Similar to above for rectangle is required
   //////////////////////////////////////////////////////
diff --git a/Grid/serialisation/BaseIO.h b/Grid/serialisation/BaseIO.h
new file mode 100644
index 00000000..dd15e7da
--- /dev/null
+++ b/Grid/serialisation/BaseIO.h
@@ -0,0 +1,344 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/serialisation/BaseIO.h
+
+    Copyright (C) 2015
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_SERIALISATION_ABSTRACT_READER_H
+#define GRID_SERIALISATION_ABSTRACT_READER_H
+
+#include <type_traits>
+#include <Grid/tensors/Tensors.h>
+#include <Grid/serialisation/VectorUtils.h>
+
+namespace Grid {
+  // Abstract writer/reader classes ////////////////////////////////////////////
+  // static polymorphism implemented using CRTP idiom
+  class Serializable;
+  
+  // Static abstract writer
+  template <typename T>
+  class Writer
+  {
+  public:
+    Writer(void);
+    virtual ~Writer(void) = default;
+    void push(const std::string &s);
+    void pop(void);
+    template <typename U>
+    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
+    write(const std::string& s, const U &output);
+    template <typename U>
+    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+    write(const std::string& s, const U &output);
+    template <typename U>
+    void write(const std::string &s, const iScalar<U> &output);
+    template <typename U, int N>
+    void write(const std::string &s, const iVector<U, N> &output);
+    template <typename U, int N>
+    void write(const std::string &s, const iMatrix<U, N> &output);
+    void         scientificFormat(const bool set);
+    bool         isScientific(void);
+    void         setPrecision(const unsigned int prec);
+    unsigned int getPrecision(void);
+  private:
+    T            *upcast;
+    bool         scientific_{false};
+    unsigned int prec_{0};
+  };
+  
+  // Static abstract reader
+  template <typename T>
+  class Reader
+  {
+  public:
+    Reader(void);
+    virtual ~Reader(void) = default;
+    bool push(const std::string &s);
+    void pop(void);
+    template <typename U>
+    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
+    read(const std::string& s, U &output);
+    template <typename U>
+    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+    read(const std::string& s, U &output);
+    template <typename U>
+    void read(const std::string &s, iScalar<U> &output);
+    template <typename U, int N>
+    void read(const std::string &s, iVector<U, N> &output);
+    template <typename U, int N>
+    void read(const std::string &s, iMatrix<U, N> &output);
+  protected:
+    template <typename U>
+    void fromString(U &output, const std::string &s);
+  private:
+    T *upcast;
+  };
+
+  // Reader/Writer detection traits: these primary templates report false for any T
+  template<typename T> struct isReader {
+    static const bool value = false;
+  };
+  template<typename T> struct isWriter {
+    static const bool value = false;
+  };
+
+  // Writer template implementation
+  template <typename T>
+  Writer<T>::Writer(void)
+  {
+    upcast = static_cast<T *>(this);
+  }
+  
+  template <typename T>
+  void Writer<T>::push(const std::string &s)
+  {
+    upcast->push(s);
+  }
+  
+  template <typename T>
+  void Writer<T>::pop(void)
+  {
+    upcast->pop();
+  }
+  
+  template <typename T>
+  template <typename U>
+  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
+  Writer<T>::write(const std::string &s, const U &output)
+  {
+    U::write(*this, s, output);
+  }
+  
+  template <typename T>
+  template <typename U>
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+  Writer<T>::write(const std::string &s, const U &output)
+  {
+    upcast->writeDefault(s, output);
+  }
+
+  template <typename T>
+  template <typename U>
+  void Writer<T>::write(const std::string &s, const iScalar<U> &output)
+  {
+    upcast->writeDefault(s, tensorToVec(output));
+  }
+
+  template <typename T>
+  template <typename U, int N>
+  void Writer<T>::write(const std::string &s, const iVector<U, N> &output)
+  {
+    upcast->writeDefault(s, tensorToVec(output));
+  }
+
+  template <typename T>
+  template <typename U, int N>
+  void Writer<T>::write(const std::string &s, const iMatrix<U, N> &output)
+  {
+    upcast->writeDefault(s, tensorToVec(output));
+  }
+
+  template <typename T>
+  void Writer<T>::scientificFormat(const bool set)
+  {
+    scientific_ = set;
+  }
+
+  template <typename T>
+  bool Writer<T>::isScientific(void)
+  {
+    return scientific_;
+  }
+
+  template <typename T>
+  void Writer<T>::setPrecision(const unsigned int prec)
+  {
+    prec_ = prec;
+  }
+
+  template <typename T>
+  unsigned int Writer<T>::getPrecision(void)
+  {
+    return prec_;
+  }
+  
+  // Reader template implementation
+  template <typename T>
+  Reader<T>::Reader(void)
+  {
+    upcast = static_cast<T *>(this);
+  }
+  
+  template <typename T>
+  bool Reader<T>::push(const std::string &s)
+  {
+    return upcast->push(s);
+  }
+  
+  template <typename T>
+  void Reader<T>::pop(void)
+  {
+    upcast->pop();
+  }
+  
+  template <typename T>
+  template <typename U>
+  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
+  Reader<T>::read(const std::string &s, U &output)
+  {
+    U::read(*this, s, output);
+  }
+  
+  template <typename T>
+  template <typename U>
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+  Reader<T>::read(const std::string &s, U &output)
+  {
+    upcast->readDefault(s, output);
+  }
+
+  template <typename T>
+  template <typename U>
+  void Reader<T>::read(const std::string &s, iScalar<U> &output)
+  {
+    typename TensorToVec<iScalar<U>>::type v;
+
+    upcast->readDefault(s, v);
+    vecToTensor(output, v);
+  }
+
+  template <typename T>
+  template <typename U, int N>
+  void Reader<T>::read(const std::string &s, iVector<U, N> &output)
+  {
+    typename TensorToVec<iVector<U, N>>::type v;
+    
+    upcast->readDefault(s, v);
+    vecToTensor(output, v);
+  }
+  
+  template <typename T>
+  template <typename U, int N>
+  void Reader<T>::read(const std::string &s, iMatrix<U, N> &output)
+  {
+    typename TensorToVec<iMatrix<U, N>>::type v;
+    
+    upcast->readDefault(s, v);
+    vecToTensor(output, v);
+  }
+
+  template <typename T>
+  template <typename U>
+  void Reader<T>::fromString(U &output, const std::string &s)
+  {
+    std::istringstream is(s);
+    
+    is.exceptions(std::ios::failbit);
+    try
+    {
+      is >> std::boolalpha >> output;
+    }
+    catch(std::ios_base::failure &e)
+    {
+      std::cerr << "numerical conversion failure on '" << s << "' ";
+      std::cerr << "(typeid: " << typeid(U).name() << ")" << std::endl;
+      abort();
+    }
+  }
+
+  // serializable base class ///////////////////////////////////////////////////
+  class Serializable
+  {
+  public:
+    template <typename T>
+    static inline void write(Writer<T> &WR,const std::string &s,
+                             const Serializable &obj)
+    {}
+    
+    template <typename T>
+    static inline void read(Reader<T> &RD,const std::string &s,
+                            Serializable &obj)
+    {}
+    
+    friend inline std::ostream & operator<<(std::ostream &os,
+                                            const Serializable &obj)
+    {
+      return os;
+    }
+  };
+  
+  // Generic writer interface //////////////////////////////////////////////////
+  template <typename T>
+  inline void push(Writer<T> &w, const std::string &s) {
+    w.push(s);
+  }
+  
+  template <typename T>
+  inline void push(Writer<T> &w, const char *s)
+  {
+    w.push(std::string(s));
+  }
+  
+  template <typename T>
+  inline void pop(Writer<T> &w)
+  {
+    w.pop();
+  }
+  
+  template <typename T, typename U>
+  inline void write(Writer<T> &w, const std::string& s, const U &output)
+  {
+    w.write(s, output);
+  }
+  
+  // Generic reader interface //////////////////////////////////////////////////
+  template <typename T>
+  inline bool push(Reader<T> &r, const std::string &s)
+  {
+    return r.push(s);
+  }
+  
+  template <typename T>
+  inline bool push(Reader<T> &r, const char *s)
+  {
+    return r.push(std::string(s));
+  }
+  
+  template <typename T>
+  inline void pop(Reader<T> &r)
+  {
+    r.pop();
+  }
+  
+  template <typename T, typename U>
+  inline void read(Reader<T> &r, const std::string &s, U &output)
+  {
+    r.read(s, output);
+  }
+}
+
+#endif
diff --git a/lib/serialisation/BinaryIO.cc b/Grid/serialisation/BinaryIO.cc
similarity index 100%
rename from lib/serialisation/BinaryIO.cc
rename to Grid/serialisation/BinaryIO.cc
diff --git a/lib/serialisation/BinaryIO.h b/Grid/serialisation/BinaryIO.h
similarity index 100%
rename from lib/serialisation/BinaryIO.h
rename to Grid/serialisation/BinaryIO.h
diff --git a/lib/serialisation/Hdf5IO.cc b/Grid/serialisation/Hdf5IO.cc
similarity index 90%
rename from lib/serialisation/Hdf5IO.cc
rename to Grid/serialisation/Hdf5IO.cc
index 1fb7be0c..77396809 100644
--- a/lib/serialisation/Hdf5IO.cc
+++ b/Grid/serialisation/Hdf5IO.cc
@@ -55,10 +55,15 @@ void Hdf5Writer::writeDefault(const std::string &s, const char *x)
   writeDefault(s, sx);
 }
 
+Group & Hdf5Writer::getGroup(void)
+{
+  return group_;
+}
+
 // Reader implementation ///////////////////////////////////////////////////////
-Hdf5Reader::Hdf5Reader(const std::string &fileName)
+Hdf5Reader::Hdf5Reader(const std::string &fileName, const bool readOnly)
 : fileName_(fileName)
-, file_(fileName.c_str(), H5F_ACC_RDONLY)
+, file_(fileName.c_str(), readOnly ? H5F_ACC_RDONLY : H5F_ACC_RDWR)
 {
   group_ = file_.openGroup("/");
   readSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold",
@@ -103,3 +108,8 @@ void Hdf5Reader::readDefault(const std::string &s, std::string &x)
   x.resize(strType.getSize());
   attribute.read(strType, &(x[0]));
 }
+
+Group & Hdf5Reader::getGroup(void)
+{
+  return group_;
+}
diff --git a/lib/serialisation/Hdf5IO.h b/Grid/serialisation/Hdf5IO.h
similarity index 95%
rename from lib/serialisation/Hdf5IO.h
rename to Grid/serialisation/Hdf5IO.h
index 94ad9736..59804240 100644
--- a/lib/serialisation/Hdf5IO.h
+++ b/Grid/serialisation/Hdf5IO.h
@@ -5,6 +5,7 @@
 #include <string>
 #include <vector>
 #include <H5Cpp.h>
+#include <Grid/tensors/Tensors.h>
 #include "Hdf5Type.h"
 
 #ifndef H5_NO_NAMESPACE
@@ -37,6 +38,7 @@ namespace Grid
     template <typename U>
     typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
     writeDefault(const std::string &s, const std::vector<U> &x);
+    H5NS::Group & getGroup(void);
   private:
     template <typename U>
     void writeSingleAttribute(const U &x, const std::string &name,
@@ -52,7 +54,7 @@ namespace Grid
   class Hdf5Reader: public Reader<Hdf5Reader>
   {
   public:
-    Hdf5Reader(const std::string &fileName);
+    Hdf5Reader(const std::string &fileName, const bool readOnly = true);
     virtual ~Hdf5Reader(void) = default;
     bool push(const std::string &s);
     void pop(void);
@@ -64,6 +66,7 @@ namespace Grid
     template <typename U>
     typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
     readDefault(const std::string &s, std::vector<U> &x);
+    H5NS::Group & getGroup(void);
   private:
     template <typename U>
     void readSingleAttribute(U &x, const std::string &name,
@@ -120,9 +123,12 @@ namespace Grid
     
     if (flatx.size() > dataSetThres_)
     {
-      H5NS::DataSet dataSet;
+      H5NS::DataSet           dataSet;
+      H5NS::DSetCreatPropList plist;
       
-      dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace);
+      plist.setChunk(dim.size(), dim.data());
+      plist.setFletcher32();
+      dataSet = group_.createDataSet(s, Hdf5Type<Element>::type(), dataSpace, plist);
       dataSet.write(flatx.data(), Hdf5Type<Element>::type());
     }
     else
diff --git a/lib/serialisation/Hdf5Type.h b/Grid/serialisation/Hdf5Type.h
similarity index 100%
rename from lib/serialisation/Hdf5Type.h
rename to Grid/serialisation/Hdf5Type.h
diff --git a/lib/serialisation/JSON_IO.cc b/Grid/serialisation/JSON_IO.cc
similarity index 100%
rename from lib/serialisation/JSON_IO.cc
rename to Grid/serialisation/JSON_IO.cc
diff --git a/lib/serialisation/JSON_IO.h b/Grid/serialisation/JSON_IO.h
similarity index 100%
rename from lib/serialisation/JSON_IO.h
rename to Grid/serialisation/JSON_IO.h
diff --git a/lib/serialisation/MacroMagic.h b/Grid/serialisation/MacroMagic.h
similarity index 91%
rename from lib/serialisation/MacroMagic.h
rename to Grid/serialisation/MacroMagic.h
index ae82bed3..9139fd98 100644
--- a/lib/serialisation/MacroMagic.h
+++ b/Grid/serialisation/MacroMagic.h
@@ -109,11 +109,11 @@ THE SOFTWARE.
 #define GRID_MACRO_MEMBER(A,B)        A B;
 #define GRID_MACRO_COMP_MEMBER(A,B) result = (result and (lhs. B == rhs. B));
 #define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" " #B << " = " << obj. B << " ; " <<std::endl;
-#define GRID_MACRO_READ_MEMBER(A,B) Grid::read(RD,#B,obj. B);
-#define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B);
+#define GRID_MACRO_READ_MEMBER(A,B)  ::Grid::read(RD,#B,obj. B);
+#define GRID_MACRO_WRITE_MEMBER(A,B) ::Grid::write(WR,#B,obj. B);
 
 #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)\
-  std::string SerialisableClassName(void) {return std::string(#cname);}	\
+  std::string SerialisableClassName(void) const {return std::string(#cname);}	\
 GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\
 template <typename T>\
 static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
@@ -125,7 +125,7 @@ template <typename T>\
 static inline void read(Reader<T> &RD,const std::string &s, cname &obj){	\
   if (!push(RD,s))\
   {\
-    std::cout << Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl;\
+    std::cout << ::Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl; \
     return;\
   };\
   GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
@@ -145,12 +145,12 @@ friend inline bool operator==(const cname &lhs, const cname &rhs) {\
 
 #define GRID_ENUM_TYPE(obj) std::remove_reference<decltype(obj)>::type
 #define GRID_MACRO_ENUMVAL(A,B) A = B,
-#define GRID_MACRO_ENUMCASE(A,B) case GRID_ENUM_TYPE(obj)::A: Grid::write(WR,s,#A); break;
+#define GRID_MACRO_ENUMCASE(A,B) case GRID_ENUM_TYPE(obj)::A: ::Grid::write(WR,s,#A); break;
 #define GRID_MACRO_ENUMTEST(A,B) else if (buf == #A) {obj = GRID_ENUM_TYPE(obj)::A;}
 #define GRID_MACRO_ENUMCASEIO(A,B) case GRID_ENUM_TYPE(obj)::A: os << #A; break;
 
 #define GRID_SERIALIZABLE_ENUM(name,undefname,...)\
-class name: public Grid::Serializable\
+class name: public ::Grid::Serializable	  \
 {\
 public:\
   enum\
@@ -162,20 +162,20 @@ public:\
   accelerator name(void)     : value_(undefname) {};		\
   accelerator name(int value): value_(value) {};			\
   template <typename T>\
-  static inline void write(Grid::Writer<T> &WR,const std::string &s, const name &obj)\
+  static inline void write(::Grid::Writer<T> &WR,const std::string &s, const name &obj) \
   {\
     switch (obj.value_)\
     {\
       GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
-      default: Grid::write(WR,s,#undefname); break;\
+      default: ::Grid::write(WR,s,#undefname); break;\
     }\
   }\
   \
   template <typename T>\
-  static inline void read(Grid::Reader<T> &RD,const std::string &s, name &obj)\
+  static inline void read(::Grid::Reader<T> &RD,const std::string &s, name &obj)\
   {\
     std::string buf;\
-    Grid::read(RD, s, buf);\
+    ::Grid::read(RD, s, buf);\
     if (buf == #undefname) {obj = name::undefname;}\
     GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\
     else {obj = name::undefname;}\
diff --git a/lib/serialisation/Serialisation.h b/Grid/serialisation/Serialisation.h
similarity index 100%
rename from lib/serialisation/Serialisation.h
rename to Grid/serialisation/Serialisation.h
diff --git a/lib/serialisation/TextIO.cc b/Grid/serialisation/TextIO.cc
similarity index 93%
rename from lib/serialisation/TextIO.cc
rename to Grid/serialisation/TextIO.cc
index 4e71adf6..ee1d01cd 100644
--- a/lib/serialisation/TextIO.cc
+++ b/Grid/serialisation/TextIO.cc
@@ -29,14 +29,13 @@
     /*  END LEGAL */
 #include <Grid/GridCore.h>
 
-//using namespace Grid;
-//using namespace std;
-
 #define GRID_TEXT_INDENT 2      //number of spaces for indentation of levels
 
 NAMESPACE_BEGIN(Grid);
-
 // Writer implementation ///////////////////////////////////////////////////////
+TextWriter::TextWriter(const std::string &fileName)
+: file_(fileName, std::ios::out)
+{}
 
 void TextWriter::push(const std::string &s)
 {
@@ -58,7 +57,7 @@ void TextWriter::indent(void)
 // Reader implementation ///////////////////////////////////////////////////////
 TextReader::TextReader(const std::string &fileName) 
 {
-  file_.open(fileName, std::ios::in);
+    file_.open(fileName, std::ios::in);
     if (!file_.is_open()) {
         std::cout << GridLogMessage << "TextReader: Error opening file " << fileName << std::endl;
         exit(1);// write better error handling
@@ -98,9 +97,8 @@ void TextReader::checkIndent(void)
 template <>
 void TextReader::readDefault(const std::string &s, std::string &output)
 {
-  checkIndent();
-  output.clear();
-  getline(file_, output);
+    checkIndent();
+    output.clear();
+    getline(file_, output);
 }
-
 NAMESPACE_END(Grid);
diff --git a/lib/serialisation/TextIO.h b/Grid/serialisation/TextIO.h
similarity index 90%
rename from lib/serialisation/TextIO.h
rename to Grid/serialisation/TextIO.h
index dd788d84..ada1aaf6 100644
--- a/lib/serialisation/TextIO.h
+++ b/Grid/serialisation/TextIO.h
@@ -43,7 +43,7 @@ namespace Grid
   class TextWriter: public Writer<TextWriter>
   {
   public:
-    TextWriter(const std::string &fileName) : file_(fileName, std::ios::out) {};
+    TextWriter(const std::string &fileName);
     virtual ~TextWriter(void) = default;
     void push(const std::string &s);
     void pop(void);
@@ -65,8 +65,10 @@ namespace Grid
     virtual ~TextReader(void) = default;
     bool push(const std::string &s);
     void pop(void);
-    template <typename U>  void readDefault(const std::string &s, U &output);
-    template <typename U>  void readDefault(const std::string &s, std::vector<U> &output);
+    template <typename U>
+    void readDefault(const std::string &s, U &output);
+    template <typename U>
+    void readDefault(const std::string &s, std::vector<U> &output);
   private:
     void checkIndent(void);
   private:
@@ -95,8 +97,6 @@ namespace Grid
   }
   
   // Reader template implementation ////////////////////////////////////////////
-  template <>  void TextReader::readDefault(const std::string &s, std::string &output);
-
   template <typename U>
   void TextReader::readDefault(const std::string &s, U &output)
   {
@@ -106,6 +106,8 @@ namespace Grid
     fromString(output, buf);
   }
   
+  template <>
+  void TextReader::readDefault(const std::string &s, std::string &output);
   
   template <typename U>
   void TextReader::readDefault(const std::string &s, std::vector<U> &output)
diff --git a/Grid/serialisation/VectorUtils.h b/Grid/serialisation/VectorUtils.h
new file mode 100644
index 00000000..b6b95c10
--- /dev/null
+++ b/Grid/serialisation/VectorUtils.h
@@ -0,0 +1,439 @@
+#ifndef GRID_SERIALISATION_VECTORUTILS_H
+#define GRID_SERIALISATION_VECTORUTILS_H
+
+#include <type_traits>
+#include <Grid/tensors/Tensors.h>
+
+namespace Grid {
+  // Pair IO utilities /////////////////////////////////////////////////////////
+  // helper function to parse input in the format "(obj1 obj2)"
+  template <typename T1, typename T2>
+  inline std::istream & operator>>(std::istream &is, std::pair<T1, T2> &buf)
+  {
+    T1 buf1;
+    T2 buf2;
+    char c = '\0';  // get() leaves c unmodified on failure, so give it a defined value
+
+    // Search for the opening "pair" delimiter. The loop is driven by the result of
+    // get() so it also terminates when the stream fails without reaching eof.
+    while (is.get(c) && c != '(') {}
+    if (c == '(')
+    {
+      int start = is.tellg();
+
+      // Search for the matching closing delimiter.
+      while (is.get(c) && c != ')') {}
+      if (c == ')')
+      {
+        int end = is.tellg();
+        int psize = end - start - 1;
+
+        // Only read data between pair limiters.
+        is.seekg(start);
+        std::string tmpstr(psize, ' ');
+        is.read(&tmpstr[0], psize);
+        std::istringstream temp(tmpstr);
+        // NOTE(review): a failed extraction here is not reported on 'is' - confirm intended.
+        temp >> buf1 >> buf2;
+        buf = std::make_pair(buf1, buf2);
+        is.seekg(end);
+      }
+    }
+    // Touch the next character so that eof is flagged when the pair was the last
+    // item of the input (callers may test eof() to stop reading).
+    is.peek();
+    return is;
+  }
+  
+  // output to streams for pairs
+  template <class T1, class T2>
+  inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
+  {
+    os << "(" << p.first << " " << p.second << ")";
+    return os;
+  }
+  
+  // Grid scalar tensors to nested std::vectors //////////////////////////////////
+  template <typename T>
+  struct TensorToVec
+  {
+    typedef T type;
+  };
+
+  template <typename T>
+  struct TensorToVec<iScalar<T>>
+  {
+    typedef typename TensorToVec<T>::type type;
+  };
+
+  template <typename T, int N>
+  struct TensorToVec<iVector<T, N>>
+  {
+    typedef typename std::vector<typename TensorToVec<T>::type> type;
+  };
+
+  template <typename T, int N>
+  struct TensorToVec<iMatrix<T, N>>
+  {
+    typedef typename std::vector<std::vector<typename TensorToVec<T>::type>> type;
+  };
+
+  template <typename T>
+  void tensorDim(std::vector<size_t> &dim, const T &t, const bool wipe = true)
+  {
+    if (wipe)
+    {
+      dim.clear();
+    }
+  }
+
+  template <typename T>
+  void tensorDim(std::vector<size_t> &dim, const iScalar<T> &t, const bool wipe = true)
+  {
+    if (wipe)
+    {
+      dim.clear();
+    }
+    tensorDim(dim, t._internal, false);
+  }
+
+  template <typename T, int N>
+  void tensorDim(std::vector<size_t> &dim, const iVector<T, N> &t, const bool wipe = true)
+  {
+    if (wipe)
+    {
+      dim.clear();
+    }
+    dim.push_back(N);
+    tensorDim(dim, t._internal[0], false);
+  }
+
+  template <typename T, int N>
+  void tensorDim(std::vector<size_t> &dim, const iMatrix<T, N> &t, const bool wipe = true)
+  {
+    if (wipe)
+    {
+      dim.clear();
+    }
+    dim.push_back(N);
+    dim.push_back(N);
+    tensorDim(dim, t._internal[0][0], false);
+  }
+
+  template <typename T>
+  typename TensorToVec<T>::type tensorToVec(const T &t)
+  {
+    return t;
+  }
+
+  template <typename T>
+  typename TensorToVec<iScalar<T>>::type tensorToVec(const iScalar<T>& t)
+  {
+    return tensorToVec(t._internal);
+  }
+
+  template <typename T, int N>
+  typename TensorToVec<iVector<T, N>>::type tensorToVec(const iVector<T, N>& t)
+  {
+    typename TensorToVec<iVector<T, N>>::type v;
+
+    v.resize(N);
+    for (unsigned int i = 0; i < N; i++) 
+    {
+      v[i] = tensorToVec(t._internal[i]);
+    }
+
+    return v;
+  }
+
+  template <typename T, int N>
+  typename TensorToVec<iMatrix<T, N>>::type tensorToVec(const iMatrix<T, N>& t)
+  {
+    typename TensorToVec<iMatrix<T, N>>::type v;
+
+    v.resize(N);
+    for (unsigned int i = 0; i < N; i++)
+    {
+      v[i].resize(N);
+      for (unsigned int j = 0; j < N; j++) 
+      {
+        v[i][j] = tensorToVec(t._internal[i][j]);
+      }
+    }
+
+    return v;
+  }
+
+  template <typename T>
+  void vecToTensor(T &t, const typename TensorToVec<T>::type &v)
+  {
+    t = v;
+  }
+
+
+  template <typename T>
+  void vecToTensor(iScalar<T> &t, const typename TensorToVec<iScalar<T>>::type &v)
+  {
+    vecToTensor(t._internal, v);
+  }
+
+  template <typename T, int N>
+  void vecToTensor(iVector<T, N> &t, const typename TensorToVec<iVector<T, N>>::type &v)
+  {
+    for (unsigned int i = 0; i < N; i++) 
+    {
+      vecToTensor(t._internal[i], v[i]);
+    }
+  }
+
+  template <typename T, int N>
+  void vecToTensor(iMatrix<T, N> &t, const typename TensorToVec<iMatrix<T, N>>::type &v)
+  {
+    for (unsigned int i = 0; i < N; i++)
+    for (unsigned int j = 0; j < N; j++)
+    {
+      vecToTensor(t._internal[i][j], v[i][j]);
+    }
+  }
+
+  // Vector element trait //////////////////////////////////////////////////////  
+  template <typename T>
+  struct element
+  {
+    typedef T type;
+    static constexpr bool is_number = false;
+  };
+  
+  template <typename T>
+  struct element<std::vector<T>>
+  {
+    typedef typename element<T>::type type;
+    static constexpr bool is_number = std::is_arithmetic<T>::value
+                                      or is_complex<T>::value
+                                      or element<T>::is_number;
+  };
+  
+  // Vector flattening utility class ////////////////////////////////////////////
+  // Class to flatten a multidimensional std::vector
+  template <typename V>
+  class Flatten
+  {
+  public:
+    typedef typename element<V>::type Element;
+  public:
+    explicit                     Flatten(const V &vector);
+    const V &                    getVector(void);
+    const std::vector<Element> & getFlatVector(void);
+    const std::vector<size_t>  & getDim(void);
+  private:
+    void accumulate(const Element &e);
+    template <typename W>
+    void accumulate(const W &v);
+    void accumulateDim(const Element &e);
+    template <typename W>
+    void accumulateDim(const W &v);
+  private:
+    const V              &vector_;
+    std::vector<Element> flatVector_;
+    std::vector<size_t>  dim_;
+  };
+  
+  // Class to reconstruct a multidimensional std::vector
+  template <typename V>
+  class Reconstruct
+  {
+  public:
+    typedef typename element<V>::type Element;
+  public:
+    Reconstruct(const std::vector<Element> &flatVector,
+                const std::vector<size_t> &dim);
+    const V &                    getVector(void);
+    const std::vector<Element> & getFlatVector(void);
+    const std::vector<size_t>  & getDim(void);
+  private:
+    void fill(std::vector<Element> &v);
+    template <typename W>
+    void fill(W &v);
+    void resize(std::vector<Element> &v, const unsigned int dim);
+    template <typename W>
+    void resize(W &v, const unsigned int dim);
+  private:
+    V                          vector_;
+    const std::vector<Element> &flatVector_;
+    std::vector<size_t>        dim_;
+    size_t                     ind_{0};
+    unsigned int               dimInd_{0};
+  };
+
+  // Flatten class template implementation
+  template <typename V>
+  void Flatten<V>::accumulate(const Element &e)
+  {
+    flatVector_.push_back(e);
+  }
+  
+  template <typename V>
+  template <typename W>
+  void Flatten<V>::accumulate(const W &v)
+  {
+    for (auto &e: v)
+    {
+      accumulate(e);
+    }
+  }
+  
+  template <typename V>
+  void Flatten<V>::accumulateDim(const Element &e) {};
+  
+  template <typename V>
+  template <typename W>
+  void Flatten<V>::accumulateDim(const W &v)
+  {
+    dim_.push_back(v.size());  // extent of this nesting level
+    if (v.size() > 0) accumulateDim(v[0]);  // empty level: v[0] would be undefined behaviour
+  }
+  
+  template <typename V>
+  Flatten<V>::Flatten(const V &vector)
+  : vector_(vector)
+  {
+    accumulate(vector_);
+    accumulateDim(vector_);
+  }
+  
+  template <typename V>
+  const V & Flatten<V>::getVector(void)
+  {
+    return vector_;
+  }
+  
+  template <typename V>
+  const std::vector<typename Flatten<V>::Element> &
+  Flatten<V>::getFlatVector(void)
+  {
+    return flatVector_;
+  }
+  
+  template <typename V>
+  const std::vector<size_t> & Flatten<V>::getDim(void)
+  {
+    return dim_;
+  }
+  
+  // Reconstruct class template implementation
+  template <typename V>
+  void Reconstruct<V>::fill(std::vector<Element> &v)
+  {
+    for (auto &e: v)
+    {
+      e = flatVector_[ind_++];
+    }
+  }
+  
+  template <typename V>
+  template <typename W>
+  void Reconstruct<V>::fill(W &v)
+  {
+    for (auto &e: v)
+    {
+      fill(e);
+    }
+  }
+  
+  template <typename V>
+  void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
+  {
+    v.resize(dim_[dim]);
+  }
+  
+  template <typename V>
+  template <typename W>
+  void Reconstruct<V>::resize(W &v, const unsigned int dim)
+  {
+    v.resize(dim_[dim]);
+    for (auto &e: v)
+    {
+      resize(e, dim + 1);
+    }
+  }
+  
+  template <typename V>
+  Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
+                              const std::vector<size_t> &dim)
+  : flatVector_(flatVector)
+  , dim_(dim)
+  {
+    resize(vector_, 0);
+    fill(vector_);
+  }
+  
+  template <typename V>
+  const V & Reconstruct<V>::getVector(void)
+  {
+    return vector_;
+  }
+  
+  template <typename V>
+  const std::vector<typename Reconstruct<V>::Element> &
+  Reconstruct<V>::getFlatVector(void)
+  {
+    return flatVector_;
+  }
+  
+  template <typename V>
+  const std::vector<size_t> & Reconstruct<V>::getDim(void)
+  {
+    return dim_;
+  }
+
+  // Vector IO utilities ///////////////////////////////////////////////////////
+  // helper function to read space-separated values
+  template <typename T>
+  std::vector<T> strToVec(const std::string s)
+  {
+    std::istringstream sstr(s);
+    T                  buf;
+    std::vector<T>     v;
+
+    // Test the extraction itself rather than eof(): an empty string or trailing
+    // whitespace must not append a stale element, and a bad token must stop the loop.
+    while (sstr >> buf)
+    {
+      v.push_back(buf);
+    }
+    return v;
+  }
+  
+  // output to streams for vectors
+  template < class T >
+  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
+  {
+    os << "[";
+    for (unsigned int i = 0; i < v.size(); ++i)
+    {
+      os << v[i];
+      if (i < v.size() - 1)
+      {
+        os << " ";
+      }
+    }
+    os << "]";
+    
+    return os;
+  }
+}
+
+// helper function to convert a vector to a string in the "[v0 v1 ...]" format
+template <typename T>
+std::string vecToStr(const std::vector<T> &v)
+{
+  using Grid::operator<<;
+  
+  std::ostringstream sstr;
+
+  sstr << v;
+
+  return sstr.str();
+}
+
+#endif
\ No newline at end of file
diff --git a/lib/serialisation/XmlIO.cc b/Grid/serialisation/XmlIO.cc
similarity index 59%
rename from lib/serialisation/XmlIO.cc
rename to Grid/serialisation/XmlIO.cc
index 6db9439a..e45108f2 100644
--- a/lib/serialisation/XmlIO.cc
+++ b/Grid/serialisation/XmlIO.cc
@@ -28,11 +28,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     /*  END LEGAL */
 #include <Grid/GridCore.h>
 
-//using namespace Grid;
-//using namespace std;
-
 NAMESPACE_BEGIN(Grid);
 
+void Grid::xmlCheckParse(const pugi::xml_parse_result &result, const std::string name)
+{
+  if (!result) 
+  {
+    std::cerr << "XML parsing error for " << name << std::endl;
+    std::cerr << "XML error description: " << result.description() << std::endl;
+    std::cerr << "XML error offset     : " << result.offset << std::endl;
+    abort();
+  }
+}
+
 // Writer implementation ///////////////////////////////////////////////////////
 XmlWriter::XmlWriter(const std::string &fileName, std::string toplev) : fileName_(fileName)
 {
@@ -47,7 +55,7 @@ XmlWriter::XmlWriter(const std::string &fileName, std::string toplev) : fileName
 XmlWriter::~XmlWriter(void)
 {
   if ( fileName_ != std::string("") ) { 
-    doc_.save_file(fileName_.c_str(), "  ");
+    doc_.save_file(fileName_.c_str(), indent_.c_str());
   }
 }
 
@@ -56,25 +64,55 @@ void XmlWriter::push(const std::string &s)
   node_ = node_.append_child(s.c_str());
 }
 
+// Parse the XML fragment s and append a copy of each of its top-level nodes under
+// the current node. Aborts through xmlCheckParse when s does not parse.
+void XmlWriter::pushXmlString(const std::string &s)
+{
+  pugi::xml_document doc;
+  auto               result = doc.load_buffer(s.c_str(), s.size());
+  xmlCheckParse(result, "fragment\n'" + s +"'");
+  // node_ stays fixed: descending into each copy nested later siblings inside the
+  // first one, and the single pop() that followed left the cursor too deep.
+  for (pugi::xml_node child = doc.first_child(); child; child = child.next_sibling())
+    node_.append_copy(child);
+}
+
 void XmlWriter::pop(void)
 {
   node_ = node_.parent();
 }
-std::string XmlWriter::XmlString(void)
+
+std::string XmlWriter::docString(void)
 {
   std::ostringstream oss; 
-  doc_.save(oss);
+  doc_.save(oss, indent_.c_str());
   return oss.str();
 }
 
-XmlReader::XmlReader(const char *xmlstring,std::string toplev) : fileName_("")
+std::string XmlWriter::string(void)
+{
+  std::ostringstream oss; 
+  doc_.save(oss, indent_.c_str(), pugi::format_default | pugi::format_no_declaration);
+  return oss.str();
+}
+
+// Reader implementation ///////////////////////////////////////////////////////
+XmlReader::XmlReader(const std::string &s,  const bool isBuffer, 
+                     std::string toplev) 
 {
   pugi::xml_parse_result result;
-  result = doc_.load_string(xmlstring);
-  if ( !result ) {
-    std::cerr << "XML error description (from char *): " << result.description() << "\nXML\n"<< xmlstring << "\n";
-    std::cerr << "XML error offset      (from char *) " << result.offset         << "\nXML\n"<< xmlstring <<"\n";
-    std::abort();
+  
+  if (isBuffer)
+  {
+    fileName_ = "<string>";
+    result    = doc_.load_string(s.c_str());
+    xmlCheckParse(result, "string\n'" + s + "'");
+  }
+  else
+  {
+    fileName_ = s;
+    result    = doc_.load_file(s.c_str());
+    xmlCheckParse(result, "file '" + fileName_ + "'");
   }
   if ( toplev == std::string("") ) {
     node_ = doc_;
@@ -83,34 +121,26 @@ XmlReader::XmlReader(const char *xmlstring,std::string toplev) : fileName_("")
   }
 }
 
-// Reader implementation ///////////////////////////////////////////////////////
-XmlReader::XmlReader(const std::string &fileName,std::string toplev) : fileName_(fileName)
-{
-  pugi::xml_parse_result result;
-  result = doc_.load_file(fileName_.c_str());
-  if ( !result ) {
-    std::cerr << "XML error description: " << result.description() <<" "<< fileName_ <<"\n";
-    std::cerr << "XML error offset     : " << result.offset        <<" "<< fileName_ <<"\n";
-    std::abort();
-  }
-  if ( toplev == std::string("") ) {
-    node_ = doc_;
-  } else { 
-    node_ = doc_.child(toplev.c_str());
-  }
+#define XML_SAFE_NODE(expr)\
+if (expr)\
+{\
+  node_ = expr;\
+  return true;\
+}\
+else\
+{\
+  return false;\
 }
 
 bool XmlReader::push(const std::string &s)
 {
-  if (node_.child(s.c_str()))
+  if (s.empty())
   {
-    node_ = node_.child(s.c_str());
-
-    return true;
+    XML_SAFE_NODE(node_.first_child());
   }
   else
   {
-    return false;
+    XML_SAFE_NODE(node_.child(s.c_str()));
   }
 }
 
@@ -121,17 +151,24 @@ void XmlReader::pop(void)
 
 bool XmlReader::nextElement(const std::string &s)
 {
-  if (node_.next_sibling(s.c_str()))
+  if (s.empty())
   {
-    node_ = node_.next_sibling(s.c_str());
-    
-    return true;
+    XML_SAFE_NODE(node_.next_sibling());
   }
   else
   {
-    return false;
+    XML_SAFE_NODE(node_.next_sibling(s.c_str()));
   }
+}
 
+void XmlReader::readCurrentSubtree(std::string &s)
+{
+  std::ostringstream oss; 
+  pugi::xml_document doc;
+
+  doc.append_copy(node_);
+  doc.save(oss, indent_.c_str(), pugi::format_default | pugi::format_no_declaration);
+  s = oss.str();
 }
 
 template <>
diff --git a/lib/serialisation/XmlIO.h b/Grid/serialisation/XmlIO.h
similarity index 83%
rename from lib/serialisation/XmlIO.h
rename to Grid/serialisation/XmlIO.h
index b1d84fbf..d8636faf 100644
--- a/lib/serialisation/XmlIO.h
+++ b/Grid/serialisation/XmlIO.h
@@ -43,6 +43,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 namespace Grid
 {
+  void xmlCheckParse(const pugi::xml_parse_result &result, const std::string name);
   
   class XmlWriter: public Writer<XmlWriter>
   {    
@@ -50,13 +51,16 @@ namespace Grid
     XmlWriter(const std::string &fileName,std::string toplev = std::string("grid") );
     virtual ~XmlWriter(void);
     void push(const std::string &s);
+    void pushXmlString(const std::string &s);
     void pop(void);
     template <typename U>
     void writeDefault(const std::string &s, const U &x);
     template <typename U>
     void writeDefault(const std::string &s, const std::vector<U> &x);
-    std::string XmlString(void);
+    std::string docString(void);
+    std::string string(void);
   private:
+    const std::string  indent_{"  "};
     pugi::xml_document doc_;
     pugi::xml_node     node_;
     std::string        fileName_;
@@ -65,16 +69,21 @@ namespace Grid
   class XmlReader: public Reader<XmlReader>
   {
   public:
-    XmlReader(const char *xmlstring,std::string toplev = std::string("grid") );
-    XmlReader(const std::string &fileName,std::string toplev = std::string("grid") );
+    XmlReader(const std::string &fileName, const bool isBuffer = false, 
+              std::string toplev = std::string("grid") );
     virtual ~XmlReader(void) = default;
-    bool push(const std::string &s);
+    bool push(const std::string &s = "");
     void pop(void);
-    bool nextElement(const std::string &s);
+    bool nextElement(const std::string &s = "");
     template <typename U>
     void readDefault(const std::string &s, U &output);
     template <typename U>    void readDefault(const std::string &s, std::vector<U> &output);
+    void readCurrentSubtree(std::string &s);
+
   private:
+    void checkParse(const pugi::xml_parse_result &result, const std::string name);
+  private:
+    const std::string  indent_{"  "};
     pugi::xml_document doc_;
     pugi::xml_node     node_;
     std::string        fileName_;
@@ -96,6 +105,14 @@ namespace Grid
   {
     std::ostringstream os;
     
+    if (this->getPrecision())
+    {
+      os.precision(this->getPrecision());
+    }
+    if (isScientific())
+    {
+      os << std::scientific;
+    }
     os << std::boolalpha << x;
     pugi::xml_node leaf = node_.append_child(s.c_str());
     leaf.append_child(pugi::node_pcdata).set_value(os.str().c_str());
diff --git a/lib/simd/BGQQPX.h b/Grid/simd/BGQQPX.h
similarity index 100%
rename from lib/simd/BGQQPX.h
rename to Grid/simd/BGQQPX.h
diff --git a/lib/simd/Grid_avx.h b/Grid/simd/Grid_avx.h
similarity index 100%
rename from lib/simd/Grid_avx.h
rename to Grid/simd/Grid_avx.h
diff --git a/lib/simd/Grid_avx512.h b/Grid/simd/Grid_avx512.h
similarity index 99%
rename from lib/simd/Grid_avx512.h
rename to Grid/simd/Grid_avx512.h
index d22a3b81..826dfc3e 100644
--- a/lib/simd/Grid_avx512.h
+++ b/Grid/simd/Grid_avx512.h
@@ -551,7 +551,7 @@ inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
   v3  = _mm256_add_epi32(v1, v2);
   v1  = _mm256_hadd_epi32(v3, v3);
   v2  = _mm256_hadd_epi32(v1, v1);
-  u1  = _mm256_castsi256_si128(v2)        // upper half
+    u1  = _mm256_castsi256_si128(v2);        // upper half
     u2  = _mm256_extracti128_si256(v2, 1);  // lower half
   ret = _mm_add_epi32(u1, u2);
   return _mm_cvtsi128_si32(ret);
diff --git a/lib/simd/Grid_generic.h b/Grid/simd/Grid_generic.h
similarity index 100%
rename from lib/simd/Grid_generic.h
rename to Grid/simd/Grid_generic.h
diff --git a/lib/simd/Grid_generic_types.h b/Grid/simd/Grid_generic_types.h
similarity index 100%
rename from lib/simd/Grid_generic_types.h
rename to Grid/simd/Grid_generic_types.h
diff --git a/lib/simd/Grid_gpu.h b/Grid/simd/Grid_gpu.h
similarity index 100%
rename from lib/simd/Grid_gpu.h
rename to Grid/simd/Grid_gpu.h
diff --git a/lib/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h
similarity index 100%
rename from lib/simd/Grid_gpu_vec.h
rename to Grid/simd/Grid_gpu_vec.h
diff --git a/Grid/simd/Grid_imci.h b/Grid/simd/Grid_imci.h
new file mode 100644
index 00000000..a1dae565
--- /dev/null
+++ b/Grid/simd/Grid_imci.h
@@ -0,0 +1,448 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/simd/Grid_imci.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <immintrin.h>
+#include <zmmintrin.h>
+
+namespace Grid{
+namespace Optimization {
+  
+  struct Vsplat{
+    //Complex float
+    inline __m512 operator()(float a, float b){
+      return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
+    }
+    // Real float
+    inline __m512 operator()(float a){
+      return _mm512_set1_ps(a);
+    }
+    //Complex double
+    inline __m512d operator()(double a, double b){
+      return _mm512_set_pd(b,a,b,a,b,a,b,a);
+    }
+    //Real double
+    inline __m512d operator()(double a){
+      return _mm512_set1_pd(a);
+    }
+    //Integer
+    inline __m512i operator()(Integer a){
+      return _mm512_set1_epi32(a);
+    }
+  };
+
+  struct Vstore{
+    //Float 
+    inline void operator()(__m512 a, float* F){
+      _mm512_store_ps(F,a);
+    }
+    //Double
+    inline void operator()(__m512d a, double* D){
+      _mm512_store_pd(D,a);
+    }
+    //Integer
+    inline void operator()(__m512i a, Integer* I){
+      _mm512_store_si512((__m512i *)I,a);
+    }
+
+  };
+
+
+  // Store vector b to address a using the _mm512_storenrngo_* intrinsics.
+  // Note the (pointer, value) argument order, the reverse of Vstore.
+  struct Vstream{
+    //Float
+    inline void operator()(float * a, __m512 b){
+      _mm512_storenrngo_ps(a,b);
+    }
+    //Double
+    inline void operator()(double * a, __m512d b){
+      _mm512_storenrngo_pd(a,b);
+    }
+  };
+
+
+
+  struct Vset{
+    // Complex float 
+    inline __m512 operator()(Grid::ComplexF *a){
+      return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
+			   a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
+			   a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
+			   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
+    }
+    // Complex double 
+    inline __m512d operator()(Grid::ComplexD *a){
+      return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
+			   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
+    }
+    // Real float 
+    inline __m512 operator()(float *a){
+      return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
+			    a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+    }
+    // Real double
+    inline __m512d operator()(double *a){
+      return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+    }
+    // Integer
+    inline __m512i operator()(Integer *a){
+      return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
+			       a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+    }
+
+
+  };
+
+  template <typename Out_type, typename In_type>
+  struct Reduce{
+    // Templated on the output type so that a reduction can be selected by its return type.
+    // This general form is a runtime trap: calling it prints an error and exits with status 1.
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
+
+
+ 
+
+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    //Complex/Real float
+    inline __m512 operator()(__m512 a, __m512 b){
+      return _mm512_add_ps(a,b);
+    }
+    //Complex/Real double
+    inline __m512d operator()(__m512d a, __m512d b){
+      return _mm512_add_pd(a,b);
+    }
+    //Integer
+    inline __m512i operator()(__m512i a, __m512i b){
+      return _mm512_add_epi32(a,b);
+    }
+  };
+
+  struct Sub{
+    //Complex/Real float
+    inline __m512 operator()(__m512 a, __m512 b){
+      return _mm512_sub_ps(a,b);
+    }
+    //Complex/Real double
+    inline __m512d operator()(__m512d a, __m512d b){
+      return _mm512_sub_pd(a,b);
+    }
+    //Integer
+    inline __m512i operator()(__m512i a, __m512i b){
+      return _mm512_sub_epi32(a,b);
+    }
+  };
+
+
+  struct MultComplex{
+    // Complex float
+    inline __m512 operator()(__m512 a, __m512 b){
+      __m512 vzero,ymm0,ymm1,real, imag;
+      vzero = _mm512_setzero_ps();
+      ymm0  = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // 
+      real  = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
+      imag  = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
+      ymm1  = _mm512_mul_ps(real, b);
+      ymm0  = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
+      return _mm512_fmadd_ps(ymm0,imag,ymm1);
+    }
+    // Complex double
+    inline __m512d operator()(__m512d a, __m512d b){
+      /* This is from
+       * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets 
+       * @inproceedings{McFarlin:2011:ASV:1995896.1995938,
+       * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
+       * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
+       * booktitle = {Proceedings of the International Conference on Supercomputing},
+       * series = {ICS '11},
+       * year = {2011},
+       * isbn = {978-1-4503-0102-2},
+       * location = {Tucson, Arizona, USA},
+       * pages = {265--274},
+       * numpages = {10},
+       * url = {http://doi.acm.org/10.1145/1995896.1995938},
+       * doi = {10.1145/1995896.1995938},
+       * acmid = {1995938},
+       * publisher = {ACM},
+       * address = {New York, NY, USA},
+       * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
+       *                } 
+       */
+      __m512d vzero,ymm0,ymm1,real,imag;
+      vzero =_mm512_setzero_pd();
+      ymm0 =  _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // 
+      real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
+      imag =  _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
+      ymm1 =  _mm512_mul_pd(real, b);
+      ymm0 =  _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
+      return  _mm512_fmadd_pd(ymm0,imag,ymm1);
+    }
+  };
+  
+  struct Mult{
+
+    inline void mac(__m512 &a, __m512 b, __m512 c){         
+       a= _mm512_fmadd_ps( b, c, a);                         
+    }
+
+    inline void mac(__m512d &a, __m512d b, __m512d c){
+      a= _mm512_fmadd_pd( b, c, a);                   
+    }                                             
+
+    // Real float
+    inline __m512 operator()(__m512 a, __m512 b){
+      return _mm512_mul_ps(a,b);
+    }
+    // Real double
+    inline __m512d operator()(__m512d a, __m512d b){
+      return _mm512_mul_pd(a,b);
+    }
+    // Integer
+    inline __m512i operator()(__m512i a, __m512i b){
+      return _mm512_mullo_epi32(a,b);
+    }
+  };
+
+  struct Div{
+    // Real float
+    inline __m512 operator()(__m512 a, __m512 b){
+      return _mm512_div_ps(a,b);
+    }
+    // Real double
+    inline __m512d operator()(__m512d a, __m512d b){
+      return _mm512_div_pd(a,b);
+    }
+  };
+
+
+  struct Conj{
+    // Complex single
+    inline __m512 operator()(__m512 in){
+      return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // odd (imaginary) lanes become 0-in, even lanes keep in
+    }
+    // Complex double
+    inline __m512d operator()(__m512d in){
+      return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
+    }
+    // do not define for integer input
+  };
+
+  // Multiply each packed complex number by -i: (re, im) -> (im, -re).
+  // The ret argument is unused; the result is returned by value.
+  struct TimesMinusI{
+    //Complex single
+    inline __m512 operator()(__m512 in, __m512 ret){
+      __m512 tmp = _mm512_mask_sub_ps(in,0x5555,_mm512_setzero_ps(),in); // negate even (real) lanes: (-re, im)
+      return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);                  // swap pairs: (im, -re)
+    }
+    //Complex double
+    inline __m512d operator()(__m512d in, __m512d ret){
+      __m512d tmp = _mm512_mask_sub_pd(in,0x55,_mm512_setzero_pd(),in); // negate even (real) lanes: (-re, im)
+      return  _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);                // swap pairs: (im, -re)
+    }
+  };
+
+  // Multiply each packed complex number by +i: (re, im) -> (-im, re).
+  // The ret argument is unused; the result is returned by value.
+  struct TimesI{
+    //Complex single
+    inline __m512 operator()(__m512 in, __m512 ret){
+      __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);         // swap pairs: (im, re)
+      return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp); // negate even lanes: (-im, re)
+    }
+    //Complex double
+    inline __m512d operator()(__m512d in, __m512d ret){
+      __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);       // swap pairs: (im, re)
+      return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);  // negate even lanes: (-im, re)
+    }
+  };
+
+
+   struct Permute{
+    
+    static inline __m512 Permute0(__m512 in){
+      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512 Permute1(__m512 in){
+      return _mm512_permute4f128_ps(in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(2,3,0,1));
+    };
+    static inline __m512 Permute2(__m512 in){
+      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_BADC);
+    };
+    static inline __m512 Permute3(__m512 in){
+      return _mm512_swizzle_ps(in,_MM_SWIZ_REG_CDAB); 
+    };
+
+    static inline __m512d Permute0(__m512d in){// Hack no intrinsic for 256 swaps of __m512d
+      return (__m512d)_mm512_permute4f128_ps((__m512)in,(_MM_PERM_ENUM)_MM_SELECT_FOUR_FOUR(1,0,3,2));
+    };
+    static inline __m512d Permute1(__m512d in){
+      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_BADC);
+    };
+    static inline __m512d Permute2(__m512d in){
+      return _mm512_swizzle_pd(in,_MM_SWIZ_REG_CDAB);
+    };
+    static inline __m512d Permute3(__m512d in){
+      return in;
+    };
+
+  };
+ 
+  struct Rotate{
+    // Runtime dispatch onto the compile-time tRotate<n>; float accepts n in [0,15], double n in [0,7]
+    static inline __m512 rotate(__m512 in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);
+      case 1: return tRotate<1>(in);
+      case 2: return tRotate<2>(in);
+      case 3: return tRotate<3>(in);
+      case 4: return tRotate<4>(in);
+      case 5: return tRotate<5>(in);
+      case 6: return tRotate<6>(in);
+      case 7: return tRotate<7>(in);
+
+      case 8 : return tRotate<8>(in);
+      case 9 : return tRotate<9>(in);
+      case 10: return tRotate<10>(in);
+      case 11: return tRotate<11>(in);
+      case 12: return tRotate<12>(in);
+      case 13: return tRotate<13>(in);
+      case 14: return tRotate<14>(in);
+      case 15: return tRotate<15>(in);
+      default: assert(0); return in; // out of range: trap when asserts are on, never fall off the end
+      }
+    }
+    static inline __m512d rotate(__m512d in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);
+      case 1: return tRotate<1>(in);
+      case 2: return tRotate<2>(in);
+      case 3: return tRotate<3>(in);
+      case 4: return tRotate<4>(in);
+      case 5: return tRotate<5>(in);
+      case 6: return tRotate<6>(in);
+      case 7: return tRotate<7>(in);
+      default: assert(0); return in; // out of range: trap when asserts are on, never fall off the end
+      }
+    }
+    // float: alignr of the register with itself by n 32-bit elements
+    template<int n> static inline __m512 tRotate(__m512 in){ 
+      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);          
+    };
+    // double: the same alignr by 2*n 32-bit elements, i.e. n doubles
+    template<int n> static inline __m512d tRotate(__m512d in){ 
+      return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);          
+    };
+
+  };
+
+
+
+  //////////////////////////////////////////////
+  // Some Template specialization
+  
+  //Complex float Reduce
+  template<>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
+    return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
+  }
+  //Real float Reduce
+  template<>
+  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
+    return _mm512_reduce_add_ps(in);
+  }
+  
+  
+  //Complex double Reduce
+  template<>
+  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
+    return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
+  }
+  
+  //Real double Reduce
+  template<>
+  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
+    return _mm512_reduce_add_pd(in);
+  }
+
+  //Integer Reduce
+  template<>
+  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
+    return _mm512_reduce_add_epi32(in);
+  }
+  
+  
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// Here assign types 
+
+  typedef __m512 SIMD_Ftype;  // Single precision type
+  typedef __m512d SIMD_Dtype; // Double precision type
+  typedef __m512i SIMD_Itype; // Integer type
+
+  // prefetch
+  inline void v_prefetch0(int size, const char *ptr){
+    for(int i=0;i<size;i+=64){ //  Define L1 linesize above
+      _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
+      _mm_prefetch(ptr+i+512,_MM_HINT_T0);
+    }
+  }
+  inline void prefetch_HINT_T0(const char *ptr){
+    _mm_prefetch(ptr,_MM_HINT_T0);
+  }
+
+
+  
+  // Function name aliases
+  typedef Optimization::Vsplat   VsplatSIMD;
+  typedef Optimization::Vstore   VstoreSIMD;
+  typedef Optimization::Vset     VsetSIMD;
+  typedef Optimization::Vstream  VstreamSIMD;
+  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+
+
+  // Arithmetic operations
+  typedef Optimization::Sum         SumSIMD;
+  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Div         DivSIMD;
+  typedef Optimization::Mult        MultSIMD;
+  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::Conj        ConjSIMD;
+  typedef Optimization::TimesMinusI TimesMinusISIMD;
+  typedef Optimization::TimesI      TimesISIMD;
+
+}
diff --git a/lib/simd/Grid_neon.h b/Grid/simd/Grid_neon.h
similarity index 100%
rename from lib/simd/Grid_neon.h
rename to Grid/simd/Grid_neon.h
diff --git a/lib/simd/Grid_qpx.h b/Grid/simd/Grid_qpx.h
similarity index 100%
rename from lib/simd/Grid_qpx.h
rename to Grid/simd/Grid_qpx.h
diff --git a/lib/simd/Grid_sse4.h b/Grid/simd/Grid_sse4.h
similarity index 100%
rename from lib/simd/Grid_sse4.h
rename to Grid/simd/Grid_sse4.h
diff --git a/lib/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h
similarity index 100%
rename from lib/simd/Grid_vector_types.h
rename to Grid/simd/Grid_vector_types.h
diff --git a/lib/simd/Grid_vector_unops.h b/Grid/simd/Grid_vector_unops.h
similarity index 100%
rename from lib/simd/Grid_vector_unops.h
rename to Grid/simd/Grid_vector_unops.h
diff --git a/lib/simd/IBM_qpx.h b/Grid/simd/IBM_qpx.h
similarity index 100%
rename from lib/simd/IBM_qpx.h
rename to Grid/simd/IBM_qpx.h
diff --git a/lib/simd/IBM_qpx_double.h b/Grid/simd/IBM_qpx_double.h
similarity index 100%
rename from lib/simd/IBM_qpx_double.h
rename to Grid/simd/IBM_qpx_double.h
diff --git a/lib/simd/IBM_qpx_single.h b/Grid/simd/IBM_qpx_single.h
similarity index 100%
rename from lib/simd/IBM_qpx_single.h
rename to Grid/simd/IBM_qpx_single.h
diff --git a/lib/simd/Intel512avx.h b/Grid/simd/Intel512avx.h
similarity index 86%
rename from lib/simd/Intel512avx.h
rename to Grid/simd/Intel512avx.h
index 9cded194..def37b9b 100644
--- a/lib/simd/Intel512avx.h
+++ b/Grid/simd/Intel512avx.h
@@ -1,4 +1,4 @@
-/*************************************************************************************
+    /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
 
@@ -23,8 +23,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
     See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
+    *************************************************************************************/
+    /*  END LEGAL */
 #ifndef GRID_ASM_AV512_H
 #define GRID_ASM_AV512_H
 
@@ -44,46 +44,46 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
 #define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
 
-#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
-  VSHUFMEMf(O,P,tmp)						\
-  VMULMEMf(O,P,B,Biirr)						\
-  VMULMEMf(O,P,C,Ciirr)						\
-  VMULf(tmp,B,Briir)						\
+#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
+  VSHUFMEMf(O,P,tmp) \
+  VMULMEMf(O,P,B,Biirr) \
+  VMULMEMf(O,P,C,Ciirr) \
+  VMULf(tmp,B,Briir) \
   VMULf(tmp,C,Criir)
 
-#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
-  VSHUFMEMd(O,P,tmp)						\
-  VMULMEMd(O,P,B,Biirr)						\
-  VMULMEMd(O,P,C,Ciirr)						\
-  VMULd(tmp,B,Briir)						\
+#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
+  VSHUFMEMd(O,P,tmp)  \
+  VMULMEMd(O,P,B,Biirr)  \
+  VMULMEMd(O,P,C,Ciirr)  \
+  VMULd(tmp,B,Briir)  \
   VMULd(tmp,C,Criir) 
 
-#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
-  VSHUFMEMf(O,P,tmp)						\
-  VMADDMEMf(O,P,B,Biirr)					\
-  VMADDMEMf(O,P,C,Ciirr)					\
-  VMADDf(tmp,B,Briir)						\
+#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
+  VSHUFMEMf(O,P,tmp) \
+  VMADDMEMf(O,P,B,Biirr) \
+  VMADDMEMf(O,P,C,Ciirr) \
+  VMADDf(tmp,B,Briir) \
   VMADDf(tmp,C,Criir)
 
 #define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)	\
-  VSHUFMEMd(O,P,tmp)						\
-  VMADDMEMd(O,P,B,Biirr)					\
-  VMADDMEMd(O,P,C,Ciirr)					\
-  VMADDd(tmp,B,Briir)						\
+  VSHUFMEMd(O,P,tmp) \
+  VMADDMEMd(O,P,B,Biirr) \
+  VMADDMEMd(O,P,C,Ciirr) \
+  VMADDd(tmp,B,Briir) \
   VMADDd(tmp,C,Criir)
 
 // Merges accumulation for complex dot chain; less efficient under avx512
-#define ZEND1f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Criir "," #Criir "," #tmp   ";\n" \
-  "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
+#define ZEND1f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Criir "," #Criir "," #tmp   ";\n"\
+                                  "vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
 
-#define ZEND2f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp   ";\n" \
-  "vsubps  " #tmp "," #Ciirr "," #Criir"{%k7}"  ";\n"
+#define ZEND2f(Criir,Ciirr, tmp)  "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp   ";\n"\
+                                  "vsubps  " #tmp "," #Ciirr "," #Criir"{%k7}"  ";\n"
 
-#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\ 
-"vaddps  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n"
+#define ZEND1d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Criir "," #Criir "," #tmp  ";\n"\
+                                  "vaddpd  " #tmp "," #Criir "," #Criir"{%k6}"  ";\n" // pd add: operands are packed doubles, as in ZEND2d
 
-#define ZEND2d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp   ";\n" \
-  "vsubpd  " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
+#define ZEND2d(Criir,Ciirr, tmp)  "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp   ";\n"\
+                         	  "vsubpd  " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
 
 #define VMOVRDUPd(OFF,A,DEST)       "vpshufd  $0x44," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 1,0,3,2
 #define VMOVIDUPd(OFF,A,DEST)       "vpshufd  $0xee," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 3,2,3,2
@@ -123,10 +123,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 #define VMULRDUPd(O,P,B,accum) "vmulpd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 #define VMULIDUPd(O,P,B,accum) "vmulpd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
-/*
- * TimesI is used only in the XP recon
- * Could zero the regs and use RECON_ACCUM
- */
+  /*
+   * TimesI is used only in the XP recon
+   * Could zero the regs and use RECON_ACCUM
+   */
 
 #define VTIMESI0f(A,DEST, Z)   VSHUFf(A,DEST)	  
 #define VTIMESI1f(A,DEST, Z)   "vaddps  " #DEST "," #Z "," #DEST"{%k6}"  ";\n"
diff --git a/lib/simd/Intel512common.h b/Grid/simd/Intel512common.h
similarity index 100%
rename from lib/simd/Intel512common.h
rename to Grid/simd/Intel512common.h
diff --git a/lib/simd/Intel512double.h b/Grid/simd/Intel512double.h
similarity index 100%
rename from lib/simd/Intel512double.h
rename to Grid/simd/Intel512double.h
diff --git a/lib/simd/Intel512imci.h b/Grid/simd/Intel512imci.h
similarity index 100%
rename from lib/simd/Intel512imci.h
rename to Grid/simd/Intel512imci.h
diff --git a/lib/simd/Intel512single.h b/Grid/simd/Intel512single.h
similarity index 100%
rename from lib/simd/Intel512single.h
rename to Grid/simd/Intel512single.h
diff --git a/lib/simd/Intel512wilson.h b/Grid/simd/Intel512wilson.h
similarity index 100%
rename from lib/simd/Intel512wilson.h
rename to Grid/simd/Intel512wilson.h
diff --git a/lib/simd/Simd.h b/Grid/simd/Simd.h
similarity index 100%
rename from lib/simd/Simd.h
rename to Grid/simd/Simd.h
diff --git a/lib/simd/l1p.h b/Grid/simd/l1p.h
similarity index 100%
rename from lib/simd/l1p.h
rename to Grid/simd/l1p.h
diff --git a/lib/sitmo_rng/README b/Grid/sitmo_rng/README
similarity index 100%
rename from lib/sitmo_rng/README
rename to Grid/sitmo_rng/README
diff --git a/lib/sitmo_rng/sitmo_prng_engine.hpp b/Grid/sitmo_rng/sitmo_prng_engine.hpp
similarity index 100%
rename from lib/sitmo_rng/sitmo_prng_engine.hpp
rename to Grid/sitmo_rng/sitmo_prng_engine.hpp
diff --git a/lib/stencil/Lebesgue.cc b/Grid/stencil/Lebesgue.cc
similarity index 100%
rename from lib/stencil/Lebesgue.cc
rename to Grid/stencil/Lebesgue.cc
diff --git a/lib/stencil/Lebesgue.h b/Grid/stencil/Lebesgue.h
similarity index 100%
rename from lib/stencil/Lebesgue.h
rename to Grid/stencil/Lebesgue.h
diff --git a/lib/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h
similarity index 100%
rename from lib/stencil/SimpleCompressor.h
rename to Grid/stencil/SimpleCompressor.h
diff --git a/lib/stencil/Stencil.cc b/Grid/stencil/Stencil.cc
similarity index 100%
rename from lib/stencil/Stencil.cc
rename to Grid/stencil/Stencil.cc
diff --git a/lib/stencil/Stencil.h b/Grid/stencil/Stencil.h
similarity index 96%
rename from lib/stencil/Stencil.h
rename to Grid/stencil/Stencil.h
index 9beb707e..47deeae7 100644
--- a/lib/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -70,6 +70,8 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
   accelerator_loopN( i,num, {
       compress.Compress(&buffer[off],table_v[i].first,rhs_v[so+table_v[i].second]);
   });
+// Further optimisation: i) streaming store the result
+//                       ii) software prefetch the first element of the next table entry
 }
 
 ///////////////////////////////////////////////////////////////////
@@ -214,7 +216,7 @@ public:
 
   int face_table_computed;
   std::vector<Vector<std::pair<int,int> > > face_table ;
-
+  std::vector<int> surface_list;
 
   Vector<StencilEntry>  _entries; // Resident in managed memory
   std::vector<Packet> Packets;
@@ -263,7 +265,6 @@ public:
 
     int dimension    = this->_directions[point];
     int displacement = this->_distances[point];
-    assert( (displacement==1) || (displacement==-1));
 
     int pd              = _grid->_processors[dimension];
     int fd              = _grid->_fdimensions[dimension];
@@ -278,9 +279,12 @@ public:
     if ( ! comm_dim ) return 1;
 
     int nbr_proc;
-    if (displacement==1) nbr_proc = 1;
+    if (displacement>0) nbr_proc = 1;
     else                 nbr_proc = pd-1;
 
+    // FIXME  this logic needs to be sorted for three link term
+    //    assert( (displacement==1) || (displacement==-1));
+    // Present hack only works for >= 4^4 subvol per node
     _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 
     void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
@@ -536,25 +540,24 @@ public:
   template<class decompressor>
   void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd) { 
 
-    for(int i=0;i<mm.size();i++){	
       mergetime-=usecond();
-      accelerator_loopN( o,mm[i].buffer_size/2, {
+    for(int i=0;i<mm.size();i++){	
+      thread_loop( (int o=0;o<mm[i].buffer_size/2;o++),{
 	decompress.Exchange(mm[i].mpointer,
 			    mm[i].vpointers[0],
 			    mm[i].vpointers[1],
 			    mm[i].type,o);
       });
-      mergetime+=usecond();
     }
+    mergetime+=usecond();
 
+    decompresstime-=usecond();
     for(int i=0;i<dd.size();i++){	
-      decompresstime-=usecond();
-      accelerator_loopN( o,dd[i].buffer_size, {
+      thread_loop( (int o=0;o<dd[i].buffer_size;o++),{
 	decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o);
       });
-      decompresstime+=usecond();
     }
-
+    decompresstime+=usecond();
   }
   ////////////////////////////////////////
   // Set up routines
@@ -569,6 +572,29 @@ public:
     }
   };
 
+  // Move interior/exterior split into the generic stencil
+  // FIXME Explicit Ls in interface is a pain. Should just use a vol
+  void BuildSurfaceList(int Ls,int vol4){
+    // Appends to surface_list every site in [0,vol4) with a point that is neither node-local nor same_node; the list is not cleared here.
+    // find same node for SHM
+    // Here we know the distance is 1 for WilsonStencil
+    for(int point=0;point<this->_npoints;point++){
+      this->same_node[point] = this->SameNode(point); // evaluated once per point, reused for every site below
+    }
+    // Classify each site; only index site*Ls is queried for a given site
+    for(int site = 0 ;site< vol4;site++){
+      int local = 1; // cleared below as soon as one point fails both tests
+      for(int point=0;point<this->_npoints;point++){
+	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ 
+	  local = 0;
+	}
+      }
+      if(local == 0) { 
+	surface_list.push_back(site); // stores the site index itself, not site*Ls
+      }
+    }
+  }
+
   CartesianStencil(GridBase *grid,
 		   int npoints,
 		   int checkerboard,
@@ -591,8 +617,10 @@ public:
     this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels
     this->_directions = StencilVector(directions);
     this->_distances  = StencilVector(distances);
+    this->same_node.resize(npoints);
 
     _unified_buffer_size=0;
+    surface_list.resize(0);
 
     int osites  = _grid->oSites();
     
diff --git a/lib/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h
similarity index 100%
rename from lib/tensors/Tensor_Ta.h
rename to Grid/tensors/Tensor_Ta.h
diff --git a/lib/tensors/Tensor_arith.h b/Grid/tensors/Tensor_arith.h
similarity index 100%
rename from lib/tensors/Tensor_arith.h
rename to Grid/tensors/Tensor_arith.h
diff --git a/lib/tensors/Tensor_arith_add.h b/Grid/tensors/Tensor_arith_add.h
similarity index 100%
rename from lib/tensors/Tensor_arith_add.h
rename to Grid/tensors/Tensor_arith_add.h
diff --git a/lib/tensors/Tensor_arith_mac.h b/Grid/tensors/Tensor_arith_mac.h
similarity index 100%
rename from lib/tensors/Tensor_arith_mac.h
rename to Grid/tensors/Tensor_arith_mac.h
diff --git a/lib/tensors/Tensor_arith_mul.h b/Grid/tensors/Tensor_arith_mul.h
similarity index 100%
rename from lib/tensors/Tensor_arith_mul.h
rename to Grid/tensors/Tensor_arith_mul.h
diff --git a/lib/tensors/Tensor_arith_scalar.h b/Grid/tensors/Tensor_arith_scalar.h
similarity index 100%
rename from lib/tensors/Tensor_arith_scalar.h
rename to Grid/tensors/Tensor_arith_scalar.h
diff --git a/lib/tensors/Tensor_arith_sub.h b/Grid/tensors/Tensor_arith_sub.h
similarity index 100%
rename from lib/tensors/Tensor_arith_sub.h
rename to Grid/tensors/Tensor_arith_sub.h
diff --git a/lib/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h
similarity index 100%
rename from lib/tensors/Tensor_class.h
rename to Grid/tensors/Tensor_class.h
diff --git a/lib/tensors/Tensor_determinant.h b/Grid/tensors/Tensor_determinant.h
similarity index 100%
rename from lib/tensors/Tensor_determinant.h
rename to Grid/tensors/Tensor_determinant.h
diff --git a/lib/tensors/Tensor_exp.h b/Grid/tensors/Tensor_exp.h
similarity index 100%
rename from lib/tensors/Tensor_exp.h
rename to Grid/tensors/Tensor_exp.h
diff --git a/lib/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h
similarity index 100%
rename from lib/tensors/Tensor_extract_merge.h
rename to Grid/tensors/Tensor_extract_merge.h
diff --git a/lib/tensors/Tensor_index.h b/Grid/tensors/Tensor_index.h
similarity index 100%
rename from lib/tensors/Tensor_index.h
rename to Grid/tensors/Tensor_index.h
diff --git a/lib/tensors/Tensor_inner.h b/Grid/tensors/Tensor_inner.h
similarity index 100%
rename from lib/tensors/Tensor_inner.h
rename to Grid/tensors/Tensor_inner.h
diff --git a/lib/tensors/Tensor_logical.h b/Grid/tensors/Tensor_logical.h
similarity index 75%
rename from lib/tensors/Tensor_logical.h
rename to Grid/tensors/Tensor_logical.h
index 9013fca8..e0438d1b 100644
--- a/lib/tensors/Tensor_logical.h
+++ b/Grid/tensors/Tensor_logical.h
@@ -25,8 +25,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_TENSOR_LOGICAL_H
-#define GRID_TENSOR_LOGICAL_H
+#pragma once
 
 NAMESPACE_BEGIN(Grid);
 
@@ -55,6 +54,39 @@ LOGICAL_BINOP(&);
 LOGICAL_BINOP(||);
 LOGICAL_BINOP(&&);
 
+// Equality of two iScalar tensors: delegates to operator== of the wrapped type.
+template <class T>
+strong_inline bool operator==(const iScalar<T> &t1, const iScalar<T> &t2)
+{
+  const bool same = (t1._internal == t2._internal);
+  return same;
+}
+
+// Equality of two iVector tensors: true iff all N components compare equal.
+template <class T, int N>
+strong_inline bool operator==(const iVector<T, N> &t1, const iVector<T, N> &t2)
+{
+  for (unsigned int idx = 0; idx < N; ++idx)
+  {
+    // stop at the first mismatch; later components are never compared
+    if (t1._internal[idx] == t2._internal[idx]) continue;
+    return false;
+  }
+  return true;
+}
+
+// Equality of two iMatrix tensors: true iff all N*N elements compare equal.
+template <class T, int N>
+strong_inline bool operator==(const iMatrix<T, N> &t1, const iMatrix<T, N> &t2)
+{
+  for (unsigned int row = 0; row < N; ++row)
+  for (unsigned int col = 0; col < N; ++col)
+  {
+    if (t1._internal[row][col] == t2._internal[row][col]) continue;
+    return false; // first mismatch decides the result
+  }
+  return true;
+}
+
 NAMESPACE_END(Grid);
 
-#endif
+
diff --git a/lib/tensors/Tensor_outer.h b/Grid/tensors/Tensor_outer.h
similarity index 100%
rename from lib/tensors/Tensor_outer.h
rename to Grid/tensors/Tensor_outer.h
diff --git a/lib/tensors/Tensor_reality.h b/Grid/tensors/Tensor_reality.h
similarity index 100%
rename from lib/tensors/Tensor_reality.h
rename to Grid/tensors/Tensor_reality.h
diff --git a/lib/tensors/Tensor_trace.h b/Grid/tensors/Tensor_trace.h
similarity index 100%
rename from lib/tensors/Tensor_trace.h
rename to Grid/tensors/Tensor_trace.h
diff --git a/lib/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h
similarity index 100%
rename from lib/tensors/Tensor_traits.h
rename to Grid/tensors/Tensor_traits.h
diff --git a/lib/tensors/Tensor_transpose.h b/Grid/tensors/Tensor_transpose.h
similarity index 100%
rename from lib/tensors/Tensor_transpose.h
rename to Grid/tensors/Tensor_transpose.h
diff --git a/lib/tensors/Tensor_unary.h b/Grid/tensors/Tensor_unary.h
similarity index 100%
rename from lib/tensors/Tensor_unary.h
rename to Grid/tensors/Tensor_unary.h
diff --git a/lib/tensors/Tensors.h b/Grid/tensors/Tensors.h
similarity index 100%
rename from lib/tensors/Tensors.h
rename to Grid/tensors/Tensors.h
diff --git a/lib/threads/Pragmas.h b/Grid/threads/Pragmas.h
similarity index 70%
rename from lib/threads/Pragmas.h
rename to Grid/threads/Pragmas.h
index 9ff15054..0d1b2e99 100644
--- a/lib/threads/Pragmas.h
+++ b/Grid/threads/Pragmas.h
@@ -44,28 +44,61 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define GRID_NVCC
 #endif
 
+#define UNROLL  _Pragma("unroll")
 
+//////////////////////////////////////////////////////////////////////////////////
+// Old primitives; to be deprecated shortly
+//////////////////////////////////////////////////////////////////////////////////
+
+#ifdef GRID_OMP
+#define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
+#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
+#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
+#define PARALLEL_NESTED_LOOP5 _Pragma("omp parallel for collapse(5)")
+#define PARALLEL_REGION       _Pragma("omp parallel")
+#define PARALLEL_CRITICAL     _Pragma("omp critical")
+#else
+#define PARALLEL_FOR_LOOP
+#define PARALLEL_FOR_LOOP_INTERN
+#define PARALLEL_FOR_LOOP_REDUCE(op, var)
+#define PARALLEL_NESTED_LOOP2
+#define PARALLEL_NESTED_LOOP5
+#define PARALLEL_REGION
+#define PARALLEL_CRITICAL
+#endif
+
+#define parallel_region    PARALLEL_REGION
+#define parallel_for       PARALLEL_FOR_LOOP for
+#define parallel_for_internal PARALLEL_FOR_LOOP_INTERN for
+#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
+#define parallel_for_nest5 PARALLEL_NESTED_LOOP5 for
+#define parallel_critical PARALLEL_CRITICAL
 
 //////////////////////////////////////////////////////////////////////////////////
 // New primitives; explicit host thread calls, and accelerator data parallel calls
 //////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_OMP
-#define thread_loop( range , ... )           _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
-#define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)")          for range  { __VA_ARGS__ ; };
-#define thread_loop_collapse2( range , ... )  _Pragma("omp parallel for collapse(2)")     for range  { __VA_ARGS__ };
-#define thread_loop_collapse3( range , ... )  _Pragma("omp parallel for collapse(3)")     for range  { __VA_ARGS__ };
-#define thread_loop_collapse4( range , ... )  _Pragma("omp parallel for collapse(4)")     for range  { __VA_ARGS__ };
-#define thread_region                         _Pragma("omp parallel")
-#define thread_critical                       _Pragma("omp critical")
+
+#define DO_PRAGMA_(x) _Pragma (#x)
+#define DO_PRAGMA(x) DO_PRAGMA_(x)
+
+#define thread_loop( range , ... )            DO_PRAGMA(omp parallel for schedule(static))   for range { __VA_ARGS__ ; };
+#define thread_loop_in_region( range , ... )  DO_PRAGMA(omp for schedule(static))            for range  { __VA_ARGS__ ; };
+#define thread_loop_collapse2( range , ... )  DO_PRAGMA(omp parallel for collapse(2))        for range  { __VA_ARGS__ };
+#define thread_loop_collapse( N , range , ... )  DO_PRAGMA(omp parallel for collapse ( N ) ) for range  { __VA_ARGS__ };
+#define thread_loop_collapse_in_region( N , range , ... )  DO_PRAGMA(omp for collapse ( N )) for range  { __VA_ARGS__ };
+#define thread_region                         DO_PRAGMA(omp parallel) // bare tokens: DO_PRAGMA stringifies its argument itself
+#define thread_critical                       DO_PRAGMA(omp critical) // a quoted argument would yield the ignored pragma "omp critical"
 #define thread_num(a) omp_get_thread_num()
 #define thread_max(a) omp_get_max_threads()
 #else
-#define thread_loop( range , ... )            for range { __VA_ARGS__ ; };
-#define thread_loop_in_region( range , ... )  for range { __VA_ARGS__ ; };
-#define thread_loop_collapse2( range , ... )  for range { __VA_ARGS__ ; };
-#define thread_loop_collapse3( range , ... )  for range { __VA_ARGS__ ; };
-#define thread_loop_collapse4( range , ... )  for range { __VA_ARGS__ ; };
+#define thread_loop( range , ... )                for range { __VA_ARGS__ ; };
+#define thread_loop_collapse2( range , ... )      for range { __VA_ARGS__ ; };
+#define thread_loop_collapse( N , range , ... )   for range { __VA_ARGS__ ; }; // no space before '(' : must be a function-like macro
 #define thread_region                           
+#define thread_loop_in_region( range , ... )  for range { __VA_ARGS__ ; };
+#define thread_loop_collapse_in_region( N, range , ... ) for range  { __VA_ARGS__ ; };
+
 #define thread_critical                         
 #define thread_num(a) (0)
 #define thread_max(a) (1)
diff --git a/lib/threads/Threads.h b/Grid/threads/Threads.h
similarity index 97%
rename from lib/threads/Threads.h
rename to Grid/threads/Threads.h
index 6699f476..29cae060 100644
--- a/lib/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -26,13 +26,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_THREADS_H
-#define GRID_THREADS_H
+#pragma once 
 
-NAMESPACE_BEGIN(Grid);
 
 // Introduce a class to gain deterministic bit reproducible reduction.
 // make static; perhaps just a namespace is required.
+NAMESPACE_BEGIN(Grid);
 
 class GridThread {
 public:
@@ -57,7 +56,6 @@ public:
   };
   static void SetMaxThreads(void) { 
 #ifdef GRID_OMP
-    //    setenv("KMP_AFFINITY","balanced",1);
     _threads = omp_get_max_threads();
     omp_set_num_threads(_threads);
 #else 
@@ -127,4 +125,4 @@ public:
 };
 
 NAMESPACE_END(Grid);
-#endif
+
diff --git a/lib/util/CompilerCompatible.h b/Grid/util/CompilerCompatible.h
similarity index 100%
rename from lib/util/CompilerCompatible.h
rename to Grid/util/CompilerCompatible.h
diff --git a/lib/util/Coordinate.h b/Grid/util/Coordinate.h
similarity index 95%
rename from lib/util/Coordinate.h
rename to Grid/util/Coordinate.h
index 72df6a7c..05c97e71 100644
--- a/lib/util/Coordinate.h
+++ b/Grid/util/Coordinate.h
@@ -62,8 +62,10 @@ public:
     _size = sz;
     for(int s=0;s<sz;s++) _data[s]=val;
   }
-  accelerator_inline pointer begin(void)                   { return _data; } 
+  accelerator_inline pointer begin(void)                   { return &_data[0]; } 
+  accelerator_inline const_pointer begin(void) const       { return &_data[0]; } 
   accelerator_inline pointer end  (void)                   { return &_data[_size]; } 
+  accelerator_inline const_pointer end  (void) const       { return &_data[_size]; } 
   accelerator_inline void push_back(const value &val)      { resize(_size+1); _data[_size-1] = val;}
   accelerator_inline AcceleratorVector()                   { _size = 0; }
   accelerator_inline AcceleratorVector(size_type sz)           { resize(sz); }
diff --git a/lib/util/Init.cc b/Grid/util/Init.cc
similarity index 98%
rename from lib/util/Init.cc
rename to Grid/util/Init.cc
index 79f9c737..3aac20a0 100644
--- a/lib/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -361,6 +361,7 @@ void Grid_init(int *argc,char ***argv)
     std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
     std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
     std::cout << "GNU General Public License for more details."<<std::endl;
+    printHash();
     std::cout << std::endl;
   }
 
@@ -445,8 +446,10 @@ void Grid_init(int *argc,char ***argv)
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
     WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
+    StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
   } else {
     WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
+    StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
@@ -462,6 +465,7 @@ void Grid_init(int *argc,char ***argv)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
     GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
+    assert(CartesianCommunicator::nCommThreads > 0);
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
diff --git a/lib/util/Init.h b/Grid/util/Init.h
similarity index 92%
rename from lib/util/Init.h
rename to Grid/util/Init.h
index aff19b04..f7f032ba 100644
--- a/lib/util/Init.h
+++ b/Grid/util/Init.h
@@ -26,8 +26,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_INIT_H
-#define GRID_INIT_H
+#pragma once
+
 
 NAMESPACE_BEGIN(Grid);
 
@@ -57,6 +57,14 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
 
+
+void GridParseLayout(char **argv,int argc,
+		     std::vector<int> &latt,
+		     std::vector<int> &simd,
+		     std::vector<int> &mpi);
+
+void printHash(void);
+
+
 NAMESPACE_END(Grid);
 
-#endif
diff --git a/lib/util/Lexicographic.h b/Grid/util/Lexicographic.h
similarity index 100%
rename from lib/util/Lexicographic.h
rename to Grid/util/Lexicographic.h
diff --git a/Grid/util/Profiling.h b/Grid/util/Profiling.h
new file mode 100644
index 00000000..acdcb0c6
--- /dev/null
+++ b/Grid/util/Profiling.h
@@ -0,0 +1,72 @@
+   /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/util/Profiling.h
+
+    Copyright (C) 2018
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#ifndef GRID_PERF_PROFILING_H
+#define GRID_PERF_PROFILING_H
+
+#include <sstream>
+#include <iostream>
+#include <functional>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <signal.h>
+
+struct System
+{
+    // Run body() while a forked `perf record -p <this pid>` samples this process; data goes to `name` (".data" appended if absent).
+    static void profile(const std::string& name,std::function<void()> body) {
+        std::string filename = name.find(".data") == std::string::npos ? (name + ".data") : name;
+        // Launch profiler, attached to the pid of this (parent) process
+        std::stringstream s;
+        s << getpid();
+        pid_t pid = fork();
+        if (pid == 0) {
+            auto fd=open("/dev/null",O_RDWR);
+            dup2(fd,1);
+            dup2(fd,2);
+            execl("/usr/bin/perf","perf","record","-o",filename.c_str(),"-p",s.str().c_str(),nullptr);
+            _exit(127); // only reached if execl failed; _exit avoids running the parent's atexit handlers in the child
+        }
+        // Run body (also when fork failed: the work is then simply not profiled)
+        body();
+        // Kill profiler. pid is -1 if fork failed, and kill(-1,...) would signal every process we are allowed to signal
+        if (pid > 0) {
+            kill(pid,SIGINT);
+            waitpid(pid,nullptr,0);
+        }
+    }
+    // Convenience overload: profile into "perf.data"
+    static void profile(std::function<void()> body) {
+        profile("perf.data",body);
+    }
+};
+
+#endif // GRID_PERF_PROFILING_H
\ No newline at end of file
diff --git a/Grid/util/Sha.h b/Grid/util/Sha.h
new file mode 100644
index 00000000..ee164c34
--- /dev/null
+++ b/Grid/util/Sha.h
@@ -0,0 +1,99 @@
+   /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./Grid/util/Sha.h
+
+    Copyright (C) 2018
+
+    Author: Peter Boyle
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+extern "C" {
+#include <openssl/sha.h>
+}
+#ifdef USE_IPP
+#include "ipp.h"
+#endif
+
+#pragma once
+
+class GridChecksum // Stateless collection of static checksum / hash helpers
+{
+public:
+  static inline uint32_t crc32(const void *data, size_t bytes) // CRC of `bytes` bytes at `data`, starting value 0
+  {
+    return ::crc32(0L,(unsigned char *)data,bytes); // global ::crc32 (presumably zlib's; its header is not included by this file)
+  }
+
+#ifdef USE_IPP
+  static inline uint32_t crc32c(const void* data, size_t bytes) // CRC32C through ippsCRC32C_8u; only compiled with USE_IPP
+  {
+      uint32_t crc32c = ~(uint32_t)0; // start from all ones
+      ippsCRC32C_8u(reinterpret_cast<const unsigned char *>(data), bytes, &crc32c);
+      ippsSwapBytes_32u_I(&crc32c, 1); // in-place byte swap of the single 32-bit word
+  
+      return ~crc32c; // final bitwise complement
+  }
+#endif
+  // Concatenate the elements of `hash`, each printed in hex after a cast to unsigned int.
+  template <typename T>
+  static inline std::string sha256_string(const std::vector<T> &hash)
+  {
+    std::stringstream sha;
+    std::string       s;
+    // NOTE(review): no setw/setfill here, so an element below 0x10 prints as ONE digit; the result is not fixed-width hex.
+    for(unsigned int i = 0; i < hash.size(); i++) 
+    { 
+        sha << std::hex << static_cast<unsigned int>(hash[i]);
+    }
+    s = sha.str();
+
+    return s;
+  }
+  static inline std::vector<unsigned char> sha256(const void *data,size_t bytes) // SHA-256 digest of `bytes` bytes at `data`
+  {
+    std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH); // output buffer, one entry per digest byte
+    SHA256_CTX sha256;
+    SHA256_Init  (&sha256);
+    SHA256_Update(&sha256, data,bytes);
+    SHA256_Final (&hash[0], &sha256); // writes the digest into hash
+    return hash;
+  }
+  static inline std::vector<int> sha256_seeds(const std::string &s) // digest of s, widened to one int per digest byte
+  {
+    std::vector<int> seeds;
+    std::vector<unsigned char> uchars = sha256((void *)s.c_str(),s.size()); // s.size() bytes: the terminating NUL is not hashed
+    for(int i=0;i<uchars.size();i++) seeds.push_back(uchars[i]); // each seed comes from an unsigned char
+    return seeds;
+  }
+};
+
+/*
+int main(int argc,char **argv)
+{
+  std::string s("The quick brown fox jumps over the lazy dog");
+  auto csum = GridChecksum::sha256_seeds(s);
+  std::cout << "SHA256 sum is 0x";
+  for(int i=0;i<csum.size();i++) { 
+    std::cout << std::hex << csum[i];
+  }
+  std::cout << std::endl;
+}
+*/
diff --git a/lib/util/Util.h b/Grid/util/Util.h
similarity index 100%
rename from lib/util/Util.h
rename to Grid/util/Util.h
diff --git a/Grid/util/version.cc b/Grid/util/version.cc
new file mode 100644
index 00000000..c9507137
--- /dev/null
+++ b/Grid/util/version.cc
@@ -0,0 +1,12 @@
+#include <iostream>
+#include "Version.h"
+namespace Grid {
+  void printHash(){ // Print the git commit hash baked into this build, or a notice when none was provided
+#ifdef GITHASH // presumably defined by the Version.h included above - TODO confirm where it is generated
+    std::cout << "Current Grid git commit hash=" << GITHASH << std::endl;
+#else
+    std::cout << "Current Grid git commit hash is undefined. Check makefile." << std::endl;
+#endif
+#undef GITHASH // the macro is dropped for the remainder of this translation unit
+} // printHash
+} // namespace Grid
diff --git a/Hadrons/A2AMatrix.hpp b/Hadrons/A2AMatrix.hpp
new file mode 100644
index 00000000..ed2f5d36
--- /dev/null
+++ b/Hadrons/A2AMatrix.hpp
@@ -0,0 +1,746 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/A2AMatrix.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef A2A_Matrix_hpp_
+#define A2A_Matrix_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/TimerArray.hpp>
+#include <Grid/Eigen/unsupported/CXX11/Tensor>
+#ifdef USE_MKL
+#include "mkl.h"
+#include "mkl_cblas.h"
+#endif
+
+#ifndef HADRONS_A2AM_NAME 
+#define HADRONS_A2AM_NAME "a2aMatrix"
+#endif
+
+#ifndef HADRONS_A2AM_IO_TYPE
+#define HADRONS_A2AM_IO_TYPE ComplexF
+#endif
+
+#define HADRONS_A2AM_PARALLEL_IO
+
+BEGIN_HADRONS_NAMESPACE
+
+// general A2A matrix set based on Eigen tensors and Grid-allocated memory
+// Dimensions:
+//   0 - ext - external field (momentum, EM field, ...)
+//   1 - str - spin-color structure
+//   2 - t   - timeslice
+//   3 - i   - left  A2A mode index
+//   4 - j   - right A2A mode index
+template <typename T>
+using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
+
+template <typename T>
+using A2AMatrix = Eigen::Matrix<T, -1, -1, Eigen::RowMajor>;
+
+template <typename T>
+using A2AMatrixTr = Eigen::Matrix<T, -1, -1, Eigen::ColMajor>;
+
+/******************************************************************************
+ *                      Abstract class for A2A kernels                        *
+ ******************************************************************************/
+template <typename T, typename Field>
+class A2AKernel // Interface only: every operation below is pure virtual
+{
+public:
+    A2AKernel(void) = default;
+    virtual ~A2AKernel(void) = default; // virtual, so derived kernels can be destroyed through a base pointer
+    virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right, // presumably fills m from left/right - confirm in implementers
+                          const unsigned int orthogDim, double &time) = 0; // time is a non-const reference; presumably receives kernel timing - TODO confirm
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0; // presumably flop count for one blockSizei x blockSizej block - TODO confirm
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0; // presumably memory traffic for the same block - TODO confirm
+};
+
+/******************************************************************************
+ *                  Class to handle A2A matrix block HDF5 I/O                 *
+ ******************************************************************************/
+template <typename T>
+class A2AMatrixIo // T is the element type of the blocks handed to saveBlock()
+{
+public:
+    // constructors (ni and nj default to 0 when omitted)
+    A2AMatrixIo(void) = default;
+    A2AMatrixIo(std::string filename, std::string dataname, 
+                const unsigned int nt, const unsigned int ni = 0,
+                const unsigned int nj = 0);
+    // destructor
+    ~A2AMatrixIo(void) = default;
+    // access (defined outside this class body; presumably return ni_/nj_/nt_ - TODO confirm)
+    unsigned int getNi(void) const;
+    unsigned int getNj(void) const;
+    unsigned int getNt(void) const;
+    size_t       getSize(void) const;
+    // file allocation
+    template <typename MetadataType>
+    void initFile(const MetadataType &d, const unsigned int chunkSize);
+    // block I/O: a raw-pointer overload, and an overload taking an A2AMatrixSet plus ext/str indices
+    void saveBlock(const T *data, const unsigned int i, const unsigned int j,
+                   const unsigned int blockSizei, const unsigned int blockSizej);
+    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
+                   const unsigned int i, const unsigned int j);
+    template <template <class> class Vec, typename VecT>
+    void load(Vec<VecT> &v, double *tRead = nullptr); // tRead is optional (defaults to nullptr)
+private:
+    std::string  filename_{""}, dataname_{""}; // default-initialised to empty strings
+    unsigned int nt_{0}, ni_{0}, nj_{0}; // default-initialised to zero
+};
+
+/******************************************************************************
+ *                  Wrapper for A2A matrix block computation                  *
+ ******************************************************************************/
+template <typename T, typename Field, typename MetadataType, typename TIo = T>
+class A2AMatrixBlockComputation // TIo (defaults to T) is the element type of the I/O object and of mBuf_
+{
+private:
+    struct IoHelper // one I/O object, its metadata and four indices
+    {
+        A2AMatrixIo<TIo> io;
+        MetadataType     md;
+        unsigned int     e, s, i, j; // presumably ext, str and the i/j block offsets - TODO confirm against saveBlock
+    };
+    typedef std::function<std::string(const unsigned int, const unsigned int)>  FilenameFn; // callable: two unsigned ints -> string
+    typedef std::function<MetadataType(const unsigned int, const unsigned int)> MetadataFn; // callable: two unsigned ints -> metadata
+public:
+    // constructor (tArray is optional and defaults to nullptr)
+    A2AMatrixBlockComputation(GridBase *grid,
+                              const unsigned int orthogDim,
+                              const unsigned int next,
+                              const unsigned int nstr,
+                              const unsigned int blockSize,
+                              const unsigned int cacheBlockSize,
+                              TimerArray *tArray = nullptr);
+    // execution (the kernel is passed through the abstract A2AKernel interface)
+    void execute(const std::vector<Field> &left, 
+                 const std::vector<Field> &right,
+                 A2AKernel<T, Field> &kernel,
+                 const FilenameFn &ionameFn,
+                 const FilenameFn &filenameFn,
+                 const MetadataFn &metadataFn);
+private:
+    // I/O handler
+    void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
+private:
+    TimerArray            *tArray_; // raw, non-owning-looking pointers: no destructor is declared in this class
+    GridBase              *grid_;
+    unsigned int          orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
+    Vector<T>             mCache_; // element type T (compute type)
+    Vector<TIo>           mBuf_; // element type TIo (I/O type)
+    std::vector<IoHelper> nodeIo_;
+};
+
+/******************************************************************************
+ *                       A2A matrix contraction kernels                       *
+ ******************************************************************************/
+class A2AContraction
+{
+public:
+    // accTrMul(acc, a, b): acc += tr(a*b); acc is accumulated into, never reset here
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void accTrMul(C &acc, const MatLeft &a, const MatRight &b)
+    {
+        if ((MatLeft::Options == Eigen::RowMajor) and // a row-major and b column-major: go row of a times column of b
+            (MatRight::Options == Eigen::ColMajor))
+        {
+            parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+            {
+                C tmp; // per-iteration partial sum
+#ifdef USE_MKL
+                dotuRow(tmp, r, a, b); // helper defined outside this view
+#else
+                tmp = a.row(r).conjugate().dot(b.col(r)); // Eigen's dot() conjugates its left operand; conjugate() undoes that
+#endif
+                parallel_critical // serialise the update of the shared accumulator
+                {
+                    acc += tmp;
+                }
+            }
+        }
+        else // any other storage combination: go column of a times row of b
+        {
+            parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+            {
+                C tmp; // per-iteration partial sum
+#ifdef USE_MKL 
+                dotuCol(tmp, c, a, b); // helper defined outside this view
+#else
+                tmp = a.col(c).conjugate().dot(b.row(c)); // same unconjugated product, summed over the other index first
+#endif
+                parallel_critical // serialise the update of the shared accumulator
+                {
+                    acc += tmp;
+                }
+            }
+        }
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline double accTrMulFlops(const MatLeft &a, const MatRight &b)
+    {
+        double n = a.rows()*a.cols();
+
+        return 8.*n;
+    }
+
+    // mul(res, a, b): res = a*b
+#ifdef USE_MKL
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexD, Opts...> &res, 
+                           const Mat<ComplexD, Opts...> &a, 
+                           const Mat<ComplexD, Opts...> &b)
+    {
+        static const ComplexD one(1., 0.), zero(0., 0.);
+
+        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
+        {
+            res.resize(a.rows(), b.cols());
+        }
+        if (Mat<ComplexD, Opts...>::Options == Eigen::RowMajor)
+        {
+            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexD, Opts...>::Options == Eigen::ColMajor)
+        {
+            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+
+    template <template <class, int...> class Mat, int... Opts>
+    static inline void mul(Mat<ComplexF, Opts...> &res, 
+                           const Mat<ComplexF, Opts...> &a, 
+                           const Mat<ComplexF, Opts...> &b)
+    {
+        static const ComplexF one(1., 0.), zero(0., 0.);
+
+        if ((res.rows() != a.rows()) or (res.cols() != b.cols()))
+        {
+            res.resize(a.rows(), b.cols());
+        }
+        if (Mat<ComplexF, Opts...>::Options == Eigen::RowMajor)
+        {
+            cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else if (Mat<ComplexF, Opts...>::Options == Eigen::ColMajor)
+        {
+            cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    }
+#else
+    template <typename Mat>
+    static inline void mul(Mat &res, const Mat &a, const Mat &b)
+    {
+        res = a*b;
+    }
+#endif
+    template <typename Mat>
+    static inline double mulFlops(const Mat &a, const Mat &b)
+    {
+        double nr = a.rows(), nc = a.cols();
+
+        return nr*nr*(6.*nc + 2.*(nc - 1.));
+    }
+private:
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotRowPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aRow, 
+                                    const MatLeft &a, const MatRight &b)
+    {
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aRow*a.cols();
+            aInc = 1;
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aRow;
+            aInc = a.rows();
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aRow;
+            bInc = b.cols();
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aRow*b.rows();
+            bInc = 1;
+        }
+    }
+
+#ifdef USE_MKL
+    template <typename C, typename MatLeft, typename MatRight>
+    static inline void makeDotColPt(C * &aPt, unsigned int &aInc, C * &bPt, 
+                                    unsigned int &bInc, const unsigned int aCol, 
+                                    const MatLeft &a, const MatRight &b)
+    {
+        if (MatLeft::Options == Eigen::RowMajor)
+        {
+            aPt  = a.data() + aCol;
+            aInc = a.cols();
+        }
+        else if (MatLeft::Options == Eigen::ColMajor)
+        {
+            aPt  = a.data() + aCol*a.rows();
+            aInc = 1;
+        }
+        if (MatRight::Options == Eigen::RowMajor)
+        {
+            bPt  = b.data() + aCol*b.cols();
+            bInc = 1;
+        }
+        else if (MatRight::Options == Eigen::ColMajor)
+        {
+            bPt  = b.data() + aCol;
+            bInc = b.rows();
+        }
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexF &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_cdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexF &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexF *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_cdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuRow(ComplexD &res, const unsigned int aRow,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotRowPt(aPt, aInc, bPt, bInc, aRow, a, b);
+        cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res);
+    }
+
+    template <typename MatLeft, typename MatRight>
+    static inline void dotuCol(ComplexD &res, const unsigned int aCol,
+                               const MatLeft &a, const MatRight &b)
+    {
+        const ComplexD *aPt, *bPt;
+        unsigned int   aInc, bInc;
+
+        makeDotColPt(aPt, aInc, bPt, bInc, aCol, a, b);
+        cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res);
+    }
+#endif
+};
+
+/******************************************************************************
+ *                     A2AMatrixIo template implementation                    *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+// Records the file name, the name of the group holding the dataset and the
+// nt x ni x nj matrix dimensions. No file is opened or created here.
+template <typename T>
+A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname, 
+                            const unsigned int nt, const unsigned int ni,
+                            const unsigned int nj)
+: filename_(filename)
+, dataname_(dataname)
+, nt_(nt)
+, ni_(ni)
+, nj_(nj)
+{
+}
+
+// access //////////////////////////////////////////////////////////////////////
+// Returns the stored time extent nt_.
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNt(void) const
+{
+    return nt_;
+}
+
+// Returns the stored first matrix dimension ni_
+// (may have been updated from the file by load()).
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNi(void) const
+{
+    return ni_;
+}
+
+// Returns the stored second matrix dimension nj_
+// (may have been updated from the file by load()).
+template <typename T>
+unsigned int A2AMatrixIo<T>::getNj(void) const
+{
+    return nj_;
+}
+
+// Returns the size in bytes of the full nt x ni x nj matrix set.
+// The product is widened to size_t before multiplying: nt_, ni_ and nj_ are
+// unsigned int and their product can exceed 32 bits for large matrices.
+template <typename T>
+size_t A2AMatrixIo<T>::getSize(void) const
+{
+    return static_cast<size_t>(nt_)*ni_*nj_*sizeof(T);
+}
+
+// file allocation /////////////////////////////////////////////////////////////
+// Creates the output file: first writes the metadata d under dataname_, then
+// re-opens the file and creates an empty nt x ni x nj dataset in that group,
+// chunked as nt x chunkSize x chunkSize with the Fletcher32 checksum filter.
+// Raises an Implementation error when built without HDF5.
+template <typename T>
+template <typename MetadataType>
+void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
+{
+#ifdef HAVE_HDF5
+    const hsize_t           ntExtent    = static_cast<hsize_t>(nt_);
+    const hsize_t           chunkExtent = static_cast<hsize_t>(chunkSize);
+    std::vector<hsize_t>    extent      = {ntExtent,
+                                           static_cast<hsize_t>(ni_),
+                                           static_cast<hsize_t>(nj_)};
+    std::vector<hsize_t>    chunkDims   = {ntExtent, chunkExtent, chunkExtent};
+    H5NS::DataSpace         fileSpace(extent.size(), extent.data());
+    H5NS::DataSet           dataset;
+    H5NS::DSetCreatPropList props;
+
+    // the writer lives in its own scope so that the metadata-only file is
+    // closed before it gets re-opened below
+    {
+        Hdf5Writer metaWriter(filename_);
+
+        write(metaWriter, dataname_, d);
+    }
+
+    // re-open the file and add the (still empty) dataset to the group
+    Hdf5Reader file(filename_, false);
+
+    push(file, dataname_);
+    auto &parent = file.getGroup();
+    props.setChunk(chunkDims.size(), chunkDims.data());
+    props.setFletcher32();
+    dataset = parent.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(),
+                                   fileSpace, props);
+#else
+    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
+#endif
+}
+
+// block I/O ///////////////////////////////////////////////////////////////////
+// Writes a contiguous nt x blockSizei x blockSizej block, read from data,
+// into the dataset at matrix offset (i, j) for all time slices.
+// The file and dataset must already exist (see initFile).
+// Raises an Implementation error when built without HDF5.
+template <typename T>
+void A2AMatrixIo<T>::saveBlock(const T *data, 
+                               const unsigned int i, 
+                               const unsigned int j,
+                               const unsigned int blockSizei,
+                               const unsigned int blockSizej)
+{
+#ifdef HAVE_HDF5
+    Hdf5Reader           reader(filename_, false);
+    // explicit widening to hsize_t, as done in initFile and load
+    std::vector<hsize_t> count = {static_cast<hsize_t>(nt_),
+                                  static_cast<hsize_t>(blockSizei),
+                                  static_cast<hsize_t>(blockSizej)},
+                         offset = {0, static_cast<hsize_t>(i),
+                                   static_cast<hsize_t>(j)},
+                         stride = {1, 1, 1},
+                         block  = {1, 1, 1}; 
+    H5NS::DataSpace      memspace(count.size(), count.data()), dataspace;
+    H5NS::DataSet        dataset;
+
+    push(reader, dataname_);
+    auto &group = reader.getGroup();
+    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
+    dataspace   = dataset.getSpace();
+    // select the target sub-block in the file, the memory side is dense
+    dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
+                              stride.data(), block.data());
+    dataset.write(data, Hdf5Type<T>::type(), memspace, dataspace);
+#else
+    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
+#endif
+}
+
+// Writes the (ext, str) slice of the matrix set m at matrix offset (i, j).
+// The block dimensions are taken from dimensions 3 and 4 of m and the number
+// of str entries from dimension 1; the slice is assumed to be stored
+// contiguously at element offset (ext*nstr + str)*nt*blockSizei*blockSizej.
+template <typename T>
+void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
+                               const unsigned int ext, const unsigned int str,
+                               const unsigned int i, const unsigned int j)
+{
+    unsigned int blockSizei = m.dimension(3);
+    unsigned int blockSizej = m.dimension(4);
+    unsigned int nstr       = m.dimension(1);
+    // computed in size_t from the first factor on: the 32-bit unsigned
+    // product can wrap around for large matrix sets
+    size_t       offset     = (static_cast<size_t>(ext)*nstr + str)
+                              *nt_*blockSizei*blockSizej;
+
+    saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
+}
+
+// Loads the whole dataset into v, one time slice at a time (v[t] receives the
+// ni x nj matrix of slice t, cast to VecT); v must already hold nt_ entries.
+// If ni_ or nj_ is zero the matrix dimensions are taken from the file,
+// otherwise the file dimensions must match (nt_, ni_, nj_).
+// tRead is optional: when non-null it receives the time accumulated in the
+// HDF5 read calls, as measured with usecond().
+// Raises a Size error on dimension mismatch and an Implementation error when
+// built without HDF5.
+template <typename T>
+template <template <class> class Vec, typename VecT>
+void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
+{
+#ifdef HAVE_HDF5
+    Hdf5Reader           reader(filename_);
+    std::vector<hsize_t> hdim;
+    H5NS::DataSet        dataset;
+    H5NS::DataSpace      dataspace;
+    H5NS::CompType       datatype;
+    
+    push(reader, dataname_);
+    auto &group = reader.getGroup();
+    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
+    datatype    = dataset.getCompType();
+    dataspace   = dataset.getSpace();
+    hdim.resize(dataspace.getSimpleExtentNdims());
+    dataspace.getSimpleExtentDims(hdim.data());
+    // hdim[0..2] are read below, make sure they exist
+    if (hdim.size() != 3)
+    {
+        HADRONS_ERROR(Size, "all-to-all matrix dataset has rank "
+            + std::to_string(hdim.size()) + ", expected 3");
+    }
+    // dimensions are tested one by one: the 32-bit product nt_*ni_*nj_ can
+    // wrap around to zero for large matrices
+    if (((nt_ != 0) and (ni_ != 0) and (nj_ != 0)) and
+        ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
+    {
+        HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
+            + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
+            + std::to_string(hdim[2]) + ", expected "
+            + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
+            + std::to_string(nj_) + ")");
+    }
+    else if ((ni_ == 0) or (nj_ == 0))
+    {
+        if (hdim[0] != nt_)
+        {
+            HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
+                + std::to_string(hdim[0]) + ", expected "
+                + std::to_string(nt_) + ")");
+        }
+        // unknown matrix dimensions: adopt the ones stored in the file
+        ni_ = hdim[1];
+        nj_ = hdim[2];
+    }
+
+    A2AMatrix<T>         buf(ni_, nj_);
+    std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
+                                     static_cast<hsize_t>(nj_)},
+                         stride   = {1, 1, 1},
+                         block    = {1, 1, 1},
+                         memCount = {static_cast<hsize_t>(ni_),
+                                     static_cast<hsize_t>(nj_)};
+    H5NS::DataSpace      memspace(memCount.size(), memCount.data());
+
+    std::cout << "Loading timeslice";
+    std::cout.flush();
+    // tRead is optional, it is tested before every access
+    if (tRead) *tRead = 0.;
+    // time slices are read from the last one down to 0
+    for (unsigned int tp1 = nt_; tp1 > 0; --tp1)
+    {
+        unsigned int         t      = tp1 - 1;
+        std::vector<hsize_t> offset = {static_cast<hsize_t>(t), 0, 0};
+        
+        if (t % 10 == 0)
+        {
+            std::cout << " " << t;
+            std::cout.flush();
+        }
+        dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
+                                  stride.data(), block.data());
+        if (tRead) *tRead -= usecond();    
+        dataset.read(buf.data(), datatype, memspace, dataspace);
+        if (tRead) *tRead += usecond();
+        v[t] = buf.template cast<VecT>();
+    }
+    std::cout << std::endl;
+#else
+    HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
+#endif
+}
+
+/******************************************************************************
+ *               A2AMatrixBlockComputation template implementation            *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+// Stores the computation parameters and allocates the two work buffers:
+// mCache_ holds one cache block (cacheBlockSize^2 entries per (ext, str, t))
+// and mBuf_ one full block (blockSize^2 entries per (ext, str, t)).
+// nt_ is the global lattice extent along orthogDim. tArray may be null, in
+// which case no timing is recorded.
+template <typename T, typename Field, typename MetadataType, typename TIo>
+A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::A2AMatrixBlockComputation(GridBase *grid,
+                            const unsigned int orthogDim,
+                            const unsigned int next, 
+                            const unsigned int nstr,
+                            const unsigned int blockSize, 
+                            const unsigned int cacheBlockSize,
+                            TimerArray *tArray)
+// initialisers listed in member declaration order (the order in which they
+// are actually executed)
+: tArray_(tArray), grid_(grid), orthogDim_(orthogDim)
+, nt_(grid->GlobalDimensions()[orthogDim])
+, next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
+{
+    // sizes are computed in size_t: the 32-bit unsigned product can wrap
+    // around for large blocks
+    mCache_.resize(static_cast<size_t>(nt_)*next_*nstr_*cacheBlockSize_*cacheBlockSize_);
+    mBuf_.resize(static_cast<size_t>(nt_)*next_*nstr_*blockSize_*blockSize_);
+}
+
+#define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
+#define STOP_TIMER(name)  if (tArray_) tArray_->stopTimer(name)
+#define GET_TIMER(name)   ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
+
+// execution ///////////////////////////////////////////////////////////////////
+// Computes the all-to-all matrix for every (left[i], right[j]) pair and
+// writes it to disk block by block.
+//   left, right : vector sets defining the matrix rows and columns
+//   kernel      : fills a cache block from two vector ranges and reports
+//                 its execution time, flops and bytes
+//   ionameFn    : (ext, str) -> name of the dataset group in the file
+//   filenameFn  : (ext, str) -> output file name
+//   metadataFn  : (ext, str) -> metadata written when the file is created
+// For each blockSize_ x blockSize_ block the kernel is run on
+// cacheBlockSize_ x cacheBlockSize_ sub-blocks, which are copied into the
+// block buffer; the block is then saved with one file per (ext, str) pair.
+template <typename T, typename Field, typename MetadataType, typename TIo>
+void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::execute(const std::vector<Field> &left, const std::vector<Field> &right,
+          A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
+          const FilenameFn &filenameFn, const MetadataFn &metadataFn)
+{
+    //////////////////////////////////////////////////////////////////////////
+    // i,j   is first  loop over blockSize_ factors
+    // ii,jj is second loop over cacheBlockSize_ factors for high perf contractions
+    // iii,jjj are loops within cacheBlock
+    // Total index is sum of these  i+ii+iii etc...
+    //////////////////////////////////////////////////////////////////////////
+    int    N_i = left.size();
+    int    N_j = right.size();
+    double flops, bytes, t_kernel;
+    double nodes = grid_->NodeCount();
+    
+    // number of blocks in each direction, rounded up
+    int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
+    int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);
+
+    for(int i=0;i<N_i;i+=blockSize_)
+    for(int j=0;j<N_j;j+=blockSize_)
+    {
+        // Get the W and V vectors for this block^2 set of terms
+        // (the last block in each direction may be smaller than blockSize_)
+        int N_ii = MIN(N_i-i,blockSize_);
+        int N_jj = MIN(N_j-j,blockSize_);
+        A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
+
+        LOG(Message) << "All-to-all matrix block " 
+                     << j/blockSize_ + NBlock_j*i/blockSize_ + 1 
+                     << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " 
+                     << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" 
+                     << std::endl;
+        // Series of cache blocked chunks of the contractions within this block
+        flops    = 0.0;
+        bytes    = 0.0;
+        t_kernel = 0.0;
+        for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
+        for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
+        {
+            double t;
+            int N_iii = MIN(N_ii-ii,cacheBlockSize_);
+            int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
+            A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
+
+            START_TIMER("kernel");
+            kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t);
+            STOP_TIMER("kernel");
+            t_kernel += t;
+            flops    += kernel.flops(N_iii, N_jjj);
+            bytes    += kernel.bytes(N_iii, N_jjj);
+
+            // copy the cache block to its place in the block buffer
+            START_TIMER("cache copy");
+            parallel_for_nest5(int e =0;e<next_;e++)
+            for(int s =0;s< nstr_;s++)
+            for(int t =0;t< nt_;t++)
+            for(int iii=0;iii< N_iii;iii++)
+            for(int jjj=0;jjj< N_jjj;jjj++)
+            {
+                mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
+            }
+            STOP_TIMER("cache copy");
+        }
+
+        // perf
+        LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes 
+                     << " Gflop/s/node " << std::endl;
+        LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes 
+                     << " GB/s/node "  << std::endl;
+
+        // IO
+        double       blockSize, ioTime;
+        unsigned int myRank = grid_->ThisRank(), nRank  = grid_->RankCount();
+    
+        LOG(Message) << "Writing block to disk" << std::endl;
+        // ioTime is the "IO: write block" timer difference across the writes
+        ioTime = -GET_TIMER("IO: write block");
+        START_TIMER("IO: total");
+        makeFileDir(filenameFn(0, 0), grid_);
+#ifdef HADRONS_A2AM_PARALLEL_IO
+        grid_->Barrier();
+        // make task list for current node
+        // ((ext, str) pairs are distributed round-robin over the ranks)
+        nodeIo_.clear();
+        for(int f = myRank; f < next_*nstr_; f += nRank)
+        {
+            IoHelper h;
+
+            h.i  = i;
+            h.j  = j;
+            h.e  = f/nstr_;
+            h.s  = f % nstr_;
+            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
+                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
+            h.md = metadataFn(h.e, h.s);
+            nodeIo_.push_back(h);
+        }
+        // parallel IO
+        for (auto &h: nodeIo_)
+        {
+            saveBlock(mBlock, h);
+        }
+        grid_->Barrier();
+#else
+        // serial IO, for testing purposes only
+        for(int e = 0; e < next_; e++)
+        for(int s = 0; s < nstr_; s++)
+        {
+            IoHelper h;
+
+            h.i  = i;
+            h.j  = j;
+            h.e  = e;
+            h.s  = s;
+            h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s), 
+                                    ionameFn(h.e, h.s), nt_, N_i, N_j);
+            h.md = metadataFn(h.e, h.s);
+            // write the block assembled above (same buffer as the parallel path)
+            saveBlock(mBlock, h);
+        }
+#endif
+        STOP_TIMER("IO: total");
+        // byte count evaluated in double: the 32-bit unsigned product can
+        // wrap around for large blocks
+        blockSize  = static_cast<double>(next_)*nstr_*nt_*N_ii*N_jj*sizeof(TIo);
+        ioTime    += GET_TIMER("IO: write block");
+        LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
+                     << ioTime  << " us (" 
+                     << blockSize/ioTime*1.0e6/1024/1024
+                     << " MB/s)" << std::endl;
+    }
+}
+
+// I/O handler /////////////////////////////////////////////////////////////////
+// Saves the (h.e, h.s) slice of the block m through the I/O object of h.
+// The output file is created, with its metadata, when the block at
+// offset (0, 0) is written. Both steps are timed separately.
+template <typename T, typename Field, typename MetadataType, typename TIo>
+void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
+::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
+{
+    const bool isOriginBlock = ((h.i == 0) and (h.j == 0));
+
+    if (isOriginBlock)
+    {
+        START_TIMER("IO: file creation");
+        h.io.initFile(h.md, blockSize_);
+        STOP_TIMER("IO: file creation");
+    }
+
+    START_TIMER("IO: write block");
+    h.io.saveBlock(m, h.e, h.s, h.i, h.j);
+    STOP_TIMER("IO: write block");
+}
+
+#undef START_TIMER
+#undef STOP_TIMER
+#undef GET_TIMER
+
+END_HADRONS_NAMESPACE
+
+#endif // A2A_Matrix_hpp_
diff --git a/Hadrons/A2AVectors.hpp b/Hadrons/A2AVectors.hpp
new file mode 100644
index 00000000..99082fb5
--- /dev/null
+++ b/Hadrons/A2AVectors.hpp
@@ -0,0 +1,342 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/A2AVectors.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: fionnoh <fionnoh@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef A2A_Vectors_hpp_
+#define A2A_Vectors_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Environment.hpp>
+#include <Hadrons/Solver.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                 Class to generate V & W all-to-all vectors                 *
+ ******************************************************************************/
+// Builds the V and W all-to-all vectors from low modes (eigenvector and
+// eigenvalue pairs) and high modes (noise sources), using the fermion matrix
+// `action` and the solver `solver`. The *5D variants additionally produce
+// the 4D vector matching the 5D one.
+template <typename FImpl>
+class A2AVectorsSchurDiagTwo
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    SOLVER_TYPE_ALIASES(FImpl,);
+public:
+    A2AVectorsSchurDiagTwo(FMat &action, Solver &solver);
+    virtual ~A2AVectorsSchurDiagTwo(void) = default;
+    // low modes: vectors built from the eigenvector evec (eval: eigenvalue)
+    void makeLowModeV(FermionField &vout, 
+                      const FermionField &evec, const Real &eval);
+    void makeLowModeV5D(FermionField &vout_4d, FermionField &vout_5d, 
+                        const FermionField &evec, const Real &eval);
+    void makeLowModeW(FermionField &wout, 
+                      const FermionField &evec, const Real &eval);
+    void makeLowModeW5D(FermionField &wout_4d, FermionField &wout_5d, 
+                        const FermionField &evec, const Real &eval);
+    // high modes: vectors built from a noise source
+    void makeHighModeV(FermionField &vout, const FermionField &noise);
+    void makeHighModeV5D(FermionField &vout_4d, FermionField &vout_5d, 
+                         const FermionField &noise_5d);
+    void makeHighModeW(FermionField &wout, const FermionField &noise);
+    // first argument is the 4D W vector, as in the definition
+    void makeHighModeW5D(FermionField &wout_4d, FermionField &wout_5d, 
+                         const FermionField &noise_5d);
+private:
+    FMat                                     &action_;
+    Solver                                   &solver_;
+    GridBase                                 *fGrid_, *frbGrid_, *gGrid_;
+    bool                                     is5d_;
+    // work fields: odd source, even/odd solutions, red-black and full-grid
+    // temporaries
+    FermionField                             src_o_, sol_e_, sol_o_, tmp_, tmp5_;
+    SchurDiagTwoOperator<FMat, FermionField> op_;
+};
+
+/******************************************************************************
+ *                  Methods for V & W all-to-all vectors I/O                  *
+ ******************************************************************************/
+// Static helpers to write and read sets of all-to-all vectors, either to a
+// single file or to one file per vector (multiFile).
+class A2AVectorsIo
+{
+public:
+    // record stored with each vector: its index in the set
+    struct Record: Serializable
+    {
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Record,
+                                        unsigned int, index);
+        Record(void): index(0) {}
+    };
+public:
+    template <typename Field>
+    static void write(const std::string fileStem, std::vector<Field> &vec, 
+                      const bool multiFile, const int trajectory = -1);
+    template <typename Field>
+    static void read(std::vector<Field> &vec, const std::string fileStem,
+                     const bool multiFile, const int trajectory = -1);
+private:
+    // Builds the output name: stem, followed by ".<traj>" when traj is not
+    // negative, followed by ".bin" in single-file mode only.
+    static inline std::string vecFilename(const std::string stem, const int traj, 
+                                          const bool multiFile)
+    {
+        std::string name = stem;
+
+        if (traj >= 0)
+        {
+            name += "." + std::to_string(traj);
+        }
+        if (!multiFile)
+        {
+            name += ".bin";
+        }
+
+        return name;
+    }
+};
+
+/******************************************************************************
+ *               A2AVectorsSchurDiagTwo template implementation               *
+ ******************************************************************************/
+// Keeps references to the action and the solver, caches the fermion, fermion
+// red-black and gauge grids of the action, allocates the work fields (all on
+// the red-black grid except tmp5_, on the full fermion grid) and builds the
+// Schur operator op_ from the action.
+// NOTE(review): is5d_ is not set in this initialiser list -- confirm it is
+// assigned elsewhere before being read.
+template <typename FImpl>
+A2AVectorsSchurDiagTwo<FImpl>::A2AVectorsSchurDiagTwo(FMat &action, Solver &solver)
+: action_(action)
+, solver_(solver)
+, fGrid_(action_.FermionGrid())
+, frbGrid_(action_.FermionRedBlackGrid())
+, gGrid_(action_.GaugeGrid())
+, src_o_(frbGrid_)
+, sol_e_(frbGrid_)
+, sol_o_(frbGrid_)
+, tmp_(frbGrid_)
+, tmp5_(fGrid_)
+, op_(action_)
+{}
+
+// Builds the low-mode V vector vout from the eigenvector evec (used as an
+// odd-checkerboard field) and its eigenvalue eval. The even and odd halves
+// are computed separately and then written into vout.
+template <typename FImpl>
+void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeV(FermionField &vout, const FermionField &evec, const Real &eval)
+{
+    src_o_ = evec;
+    src_o_.Checkerboard() = Odd;
+    // NOTE(review): these picks read vout before it is filled; presumably
+    // they are only there to set up the even/odd work fields -- confirm
+    pickCheckerboard(Even, sol_e_, vout);
+    pickCheckerboard(Odd, sol_o_, vout);
+
+    /////////////////////////////////////////////////////
+    // v_ie = -(1/eval_i) * MeeInv Meo MooInv evec_i
+    /////////////////////////////////////////////////////
+    action_.MooeeInv(src_o_, tmp_);
+    assert(tmp_.Checkerboard() == Odd);
+    action_.Meooe(tmp_, sol_e_);
+    assert(sol_e_.Checkerboard() == Even);
+    action_.MooeeInv(sol_e_, tmp_);
+    assert(tmp_.Checkerboard() == Even);
+    sol_e_ = (-1.0 / eval) * tmp_;
+    assert(sol_e_.Checkerboard() == Even);
+
+    /////////////////////////////////////////////////////
+    // v_io = (1/eval_i) * MooInv evec_i
+    /////////////////////////////////////////////////////
+    // MooInv evec is recomputed here: tmp_ was overwritten above
+    action_.MooeeInv(src_o_, tmp_);
+    assert(tmp_.Checkerboard() == Odd);
+    sol_o_ = (1.0 / eval) * tmp_;
+    assert(sol_o_.Checkerboard() == Odd);
+    // write both halves into the output vector
+    setCheckerboard(vout, sol_e_);
+    assert(sol_e_.Checkerboard() == Even);
+    setCheckerboard(vout, sol_o_);
+    assert(sol_o_.Checkerboard() == Odd);
+}
+
+// 5D variant of makeLowModeV: the low-mode V vector is built in vout_5d and
+// its physical fermion solution is exported to vout_4d.
+template <typename FImpl>
+void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeV5D(FermionField &vout_4d, 
+                                                   FermionField &vout_5d, 
+                                                   const FermionField &evec, 
+                                                   const Real &eval)
+{
+    // 5D vector first
+    makeLowModeV(vout_5d, evec, eval);
+    // then its 4D counterpart
+    action_.ExportPhysicalFermionSolution(vout_5d, vout_4d);
+}
+
+// Builds the low-mode W vector wout from the eigenvector evec (used as an
+// odd-checkerboard field). The even and odd halves are computed separately
+// and then written into wout. The eigenvalue eval is not used here.
+template <typename FImpl>
+void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeW(FermionField &wout, const FermionField &evec, const Real &eval)
+{
+    src_o_ = evec;
+    src_o_.Checkerboard() = Odd;
+    // NOTE(review): these picks read wout before it is filled; presumably
+    // they are only there to set up the even/odd work fields -- confirm
+    pickCheckerboard(Even, sol_e_, wout);
+    pickCheckerboard(Odd, sol_o_, wout);
+
+    /////////////////////////////////////////////////////
+    // w_ie = - MeeInvDag MoeDag Doo evec_i
+    /////////////////////////////////////////////////////
+    op_.Mpc(src_o_, tmp_);
+    assert(tmp_.Checkerboard() == Odd);
+    action_.MeooeDag(tmp_, sol_e_);
+    assert(sol_e_.Checkerboard() == Even);
+    action_.MooeeInvDag(sol_e_, tmp_);
+    assert(tmp_.Checkerboard() == Even);
+    sol_e_ = (-1.0) * tmp_;
+
+    /////////////////////////////////////////////////////
+    // w_io = Doo evec_i
+    /////////////////////////////////////////////////////
+    // Mpc is applied a second time: tmp_ was overwritten above
+    op_.Mpc(src_o_, sol_o_);
+    assert(sol_o_.Checkerboard() == Odd);
+    // write both halves into the output vector
+    setCheckerboard(wout, sol_e_);
+    assert(sol_e_.Checkerboard() == Even);
+    setCheckerboard(wout, sol_o_);
+    assert(sol_o_.Checkerboard() == Odd);
+}
+
+// 5D variant of makeLowModeW: the low-mode W vector is built in the work
+// field tmp5_, DminusDag is applied to it to produce wout_5d, and the
+// physical fermion source of wout_5d is exported to wout_4d.
+template <typename FImpl>
+void A2AVectorsSchurDiagTwo<FImpl>::makeLowModeW5D(FermionField &wout_4d, 
+                                                   FermionField &wout_5d, 
+                                                   const FermionField &evec, 
+                                                   const Real &eval)
+{
+    makeLowModeW(tmp5_, evec, eval);
+    action_.DminusDag(tmp5_, wout_5d);
+    action_.ExportPhysicalFermionSource(wout_5d, wout_4d);
+}
+
+// High-mode V vector: vout is the result of applying the solver to the
+// noise source.
+template <typename FImpl>
+void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeV(FermionField &vout, 
+                                                  const FermionField &noise)
+{
+    solver_(vout, noise);
+}
+
+// 5D variant of makeHighModeV. The noise may live on a grid with one
+// dimension less than the fermion grid, in which case it is first imported
+// as a physical fermion source into tmp5_; otherwise it is copied as is.
+// The solver result goes to vout_5d and its physical solution to vout_4d.
+template <typename FImpl>
+void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeV5D(FermionField &vout_4d, 
+                                                    FermionField &vout_5d, 
+                                                    const FermionField &noise)
+{
+    const bool noiseHasOneDimLess = 
+        (noise.Grid()->Dimensions() == fGrid_->Dimensions() - 1);
+
+    if (!noiseHasOneDimLess)
+    {
+        tmp5_ = noise;
+    }
+    else
+    {
+        action_.ImportPhysicalFermionSource(noise, tmp5_);
+    }
+
+    makeHighModeV(vout_5d, tmp5_);
+    action_.ExportPhysicalFermionSolution(vout_5d, vout_4d);
+}
+
+// High-mode W vector: wout is a plain copy of the noise source.
+template <typename FImpl>
+void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW(FermionField &wout, 
+                                                  const FermionField &noise)
+{
+    wout = noise;
+}
+
+// 5D variant of makeHighModeW. If the noise lives on a grid with one
+// dimension less than the fermion grid, it is copied to wout_4d and imported
+// into wout_5d; otherwise it is copied to wout_5d and its physical fermion
+// source is exported to wout_4d.
+template <typename FImpl>
+void A2AVectorsSchurDiagTwo<FImpl>::makeHighModeW5D(FermionField &wout_4d, 
+                                                    FermionField &wout_5d, 
+                                                    const FermionField &noise)
+{
+    const bool noiseHasOneDimLess = 
+        (noise.Grid()->Dimensions() == fGrid_->Dimensions() - 1);
+
+    if (!noiseHasOneDimLess)
+    {
+        wout_5d = noise;
+        action_.ExportPhysicalFermionSource(wout_5d, wout_4d);
+    }
+    else
+    {
+        action_.ImportUnphysicalFermion(noise, wout_5d);
+        wout_4d = noise;
+    }
+}
+
+/******************************************************************************
+ *               all-to-all vectors I/O template implementation               *
+ ******************************************************************************/
+// Writes the vectors of vec with their index as record.
+//   fileStem   : base name of the output, see vecFilename
+//   multiFile  : true  -> one file "<name>/elem<i>.bin" per vector
+//                false -> a single file holding all the vectors in order
+//   trajectory : appended to the name when not negative
+// Raises a Size error if vec is empty: the grid used for the I/O is taken
+// from the first vector.
+template <typename Field>
+void A2AVectorsIo::write(const std::string fileStem, std::vector<Field> &vec, 
+                         const bool multiFile, const int trajectory)
+{
+    // vec[0] is accessed right below
+    if (vec.empty())
+    {
+        HADRONS_ERROR(Size, "cannot write an empty set of all-to-all vectors");
+    }
+
+    Record       record;
+    GridBase     *grid = vec[0].Grid();
+    ScidacWriter binWriter(grid->IsBoss());
+    std::string  filename = vecFilename(fileStem, trajectory, multiFile);
+
+    if (multiFile)
+    {
+        std::string fullFilename;
+
+        // one file per vector, opened and closed in turn
+        for (unsigned int i = 0; i < vec.size(); ++i)
+        {
+            fullFilename = filename + "/elem" + std::to_string(i) + ".bin";
+
+            LOG(Message) << "Writing vector " << i << std::endl;
+            makeFileDir(fullFilename, grid);
+            binWriter.open(fullFilename);
+            record.index = i;
+            binWriter.writeScidacFieldRecord(vec[i], record);
+            binWriter.close();
+        }
+    }
+    else
+    {
+        // single file, vectors are appended in index order
+        makeFileDir(filename, grid);
+        binWriter.open(filename);
+        for (unsigned int i = 0; i < vec.size(); ++i)
+        {
+            LOG(Message) << "Writing vector " << i << std::endl;
+            record.index = i;
+            binWriter.writeScidacFieldRecord(vec[i], record);
+        }
+        binWriter.close();
+    }
+}
+
+// Reads vec.size() vectors, from a single file or from one file
+// "<name>/elem<k>.bin" per vector (multiFile), see vecFilename for the name.
+// The index stored with each vector must match its position in vec,
+// otherwise an Io error is raised.
+template <typename Field>
+void A2AVectorsIo::read(std::vector<Field> &vec, const std::string fileStem, 
+                        const bool multiFile, const int trajectory)
+{
+    Record       header;
+    ScidacReader reader;
+    std::string  baseName = vecFilename(fileStem, trajectory, multiFile);
+
+    if (!multiFile)
+    {
+        // single file: records are read sequentially
+        reader.open(baseName);
+        for (unsigned int k = 0; k < vec.size(); ++k)
+        {
+            LOG(Message) << "Reading vector " << k << std::endl;
+            reader.readScidacFieldRecord(vec[k], header);
+            if (header.index != k)
+            {
+                HADRONS_ERROR(Io, "vector index mismatch");
+            }
+        }
+        reader.close();
+    }
+    else
+    {
+        // one file per vector, opened and closed in turn
+        for (unsigned int k = 0; k < vec.size(); ++k)
+        {
+            const std::string elemName = baseName + "/elem" + std::to_string(k) + ".bin";
+
+            LOG(Message) << "Reading vector " << k << std::endl;
+            reader.open(elemName);
+            reader.readScidacFieldRecord(vec[k], header);
+            reader.close();
+            if (header.index != k)
+            {
+                HADRONS_ERROR(Io, "vector index mismatch");
+            }
+        }
+    }
+}
+
+END_HADRONS_NAMESPACE
+
+#endif // A2A_Vectors_hpp_
diff --git a/extras/Hadrons/Application.cc b/Hadrons/Application.cc
similarity index 72%
rename from extras/Hadrons/Application.cc
rename to Hadrons/Application.cc
index 649cae9b..7c370b7c 100644
--- a/extras/Hadrons/Application.cc
+++ b/Hadrons/Application.cc
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Application.cc
+Source file: Hadrons/Application.cc
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -27,28 +26,27 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Hadrons/Application.hpp>
-#include <Grid/Hadrons/GeneticScheduler.hpp>
+#include <Hadrons/Application.hpp>
+#include <Hadrons/GeneticScheduler.hpp>
+#include <Hadrons/Modules.hpp>
 
 using namespace Grid;
  
 using namespace Hadrons;
 
-#define BIG_SEP "==============="
-#define SEP     "---------------"
+#define BIG_SEP "================"
+#define SEP     "----------------"
 
 /******************************************************************************
  *                       Application implementation                           *
  ******************************************************************************/
 // constructors ////////////////////////////////////////////////////////////////
+#define MACOUT(macro)    macro              << " (" << #macro << ")"
+#define MACOUTS(macro) HADRONS_STR(macro) << " (" << #macro << ")"
+
 Application::Application(void)
 {
-    LOG(Message) << "Modules available:" << std::endl;
-    auto list = ModuleFactory::getInstance().getBuilderList();
-    for (auto &m: list)
-    {
-        LOG(Message) << "  " << m << std::endl;
-    }
+    initLogger();
     auto dim = GridDefaultLatt(), mpi = GridDefaultMpi(), loc(dim);
     locVol_ = 1;
     for (unsigned int d = 0; d < dim.size(); ++d)
@@ -56,9 +54,22 @@ Application::Application(void)
         loc[d]  /= mpi[d];
         locVol_ *= loc[d];
     }
+    LOG(Message) << "====== HADRONS APPLICATION INITIALISATION ======" << std::endl;
+    LOG(Message) << "** Dimensions" << std::endl;
     LOG(Message) << "Global lattice: " << dim << std::endl;
     LOG(Message) << "MPI partition : " << mpi << std::endl;
     LOG(Message) << "Local lattice : " << loc << std::endl;
+    LOG(Message) << std::endl;
+    LOG(Message) << "** Default parameters (and associated C macros)" << std::endl;
+    LOG(Message) << "ASCII output precision  : " << MACOUT(DEFAULT_ASCII_PREC) << std::endl;
+    LOG(Message) << "Fermion implementation  : " << MACOUTS(FIMPLBASE) << std::endl;
+    LOG(Message) << "z-Fermion implementation: " << MACOUTS(ZFIMPLBASE) << std::endl;
+    LOG(Message) << "Scalar implementation   : " << MACOUTS(SIMPLBASE) << std::endl;
+    LOG(Message) << "Gauge implementation    : " << MACOUTS(GIMPLBASE) << std::endl;
+    LOG(Message) << "Eigenvector base size   : " 
+                 << MACOUT(HADRONS_DEFAULT_LANCZOS_NBASIS) << std::endl;
+    LOG(Message) << "Schur decomposition     : " << MACOUTS(HADRONS_DEFAULT_SCHUR) << std::endl;
+    LOG(Message) << std::endl;
 }
 
 Application::Application(const Application::GlobalPar &par)
@@ -77,7 +88,6 @@ Application::Application(const std::string parameterFileName)
 void Application::setPar(const Application::GlobalPar &par)
 {
     par_ = par;
-    env().setSeed(strToVec<int>(par_.seed));
 }
 
 const Application::GlobalPar & Application::getPar(void)
@@ -88,14 +98,29 @@ const Application::GlobalPar & Application::getPar(void)
 // execute /////////////////////////////////////////////////////////////////////
 void Application::run(void)
 {
+    LOG(Message) << "====== HADRONS APPLICATION START ======" << std::endl;
     if (!parameterFileName_.empty() and (vm().getNModule() == 0))
     {
         parseParameterFile(parameterFileName_);
     }
+    if (getPar().runId.empty())
+    {
+        HADRONS_ERROR(Definition, "run id is empty");
+    }
+    LOG(Message) << "RUN ID '" << getPar().runId << "'" << std::endl;
+    BinaryIO::latticeWriteMaxRetry = getPar().parallelWriteMaxRetry;
+    LOG(Message) << "Attempt(s) for resilient parallel I/O: " 
+                 << BinaryIO::latticeWriteMaxRetry << std::endl;
+    vm().setRunId(getPar().runId);
     vm().printContent();
     env().printContent();
     schedule();
     printSchedule();
+    if (!getPar().graphFile.empty())
+    {
+        makeFileDir(getPar().graphFile, env().getGrid());
+        vm().dumpModuleGraph(getPar().graphFile);
+    }
     configLoop();
 }
 
@@ -119,12 +144,12 @@ void Application::parseParameterFile(const std::string parameterFileName)
     setPar(par);
     if (!push(reader, "modules"))
     {
-        HADRON_ERROR(Parsing, "Cannot open node 'modules' in parameter file '" 
+        HADRONS_ERROR(Parsing, "Cannot open node 'modules' in parameter file '" 
                               + parameterFileName + "'");
     }
     if (!push(reader, "module"))
     {
-        HADRON_ERROR(Parsing, "Cannot open node 'modules/module' in parameter file '" 
+        HADRONS_ERROR(Parsing, "Cannot open node 'modules/module' in parameter file '" 
                               + parameterFileName + "'");
     }
     do
@@ -138,11 +163,13 @@ void Application::parseParameterFile(const std::string parameterFileName)
 
 void Application::saveParameterFile(const std::string parameterFileName)
 {
+    LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl;
+    if (env().getGrid()->IsBoss())
+    {
     XmlWriter          writer(parameterFileName);
     ObjectId           id;
     const unsigned int nMod = vm().getNModule();
     
-    LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl;
     write(writer, "parameters", getPar());
     push(writer, "modules");
     for (unsigned int i = 0; i < nMod; ++i)
@@ -157,6 +184,7 @@ void Application::saveParameterFile(const std::string parameterFileName)
     pop(writer);
     pop(writer);
 }
+}
 
 // schedule computation ////////////////////////////////////////////////////////
 void Application::schedule(void)
@@ -170,21 +198,25 @@ void Application::schedule(void)
 
 void Application::saveSchedule(const std::string filename)
 {
+    LOG(Message) << "Saving current schedule to '" << filename << "'..."
+                 << std::endl;
+    if (env().getGrid()->IsBoss())
+    {
     TextWriter               writer(filename);
     std::vector<std::string> program;
     
     if (!scheduled_)
     {
-        HADRON_ERROR(Definition, "Computation not scheduled");
+            HADRONS_ERROR(Definition, "Computation not scheduled");
     }
-    LOG(Message) << "Saving current schedule to '" << filename << "'..."
-                 << std::endl;
+
     for (auto address: program_)
     {
         program.push_back(vm().getModuleName(address));
     }
     write(writer, "schedule", program);
 }
+}
 
 void Application::loadSchedule(const std::string filename)
 {
@@ -200,13 +232,14 @@ void Application::loadSchedule(const std::string filename)
         program_.push_back(vm().getModuleAddress(name));
     }
     loadedSchedule_ = true;
+    scheduled_      = true;
 }
 
 void Application::printSchedule(void)
 {
-    if (!scheduled_)
+    if (!scheduled_ and !loadedSchedule_)
     {
-        HADRON_ERROR(Definition, "Computation not scheduled");
+        HADRONS_ERROR(Definition, "Computation not scheduled");
     }
     auto peak = vm().memoryNeeded(program_);
     LOG(Message) << "Schedule (memory needed: " << sizeString(peak) << "):"
diff --git a/extras/Hadrons/Application.hpp b/Hadrons/Application.hpp
similarity index 91%
rename from extras/Hadrons/Application.hpp
rename to Hadrons/Application.hpp
index 4b2ce77b..3578c919 100644
--- a/extras/Hadrons/Application.hpp
+++ b/Hadrons/Application.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Application.hpp
+Source file: Hadrons/Application.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -30,9 +29,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_Application_hpp_
 #define Hadrons_Application_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/VirtualMachine.hpp>
-#include <Grid/Hadrons/Modules.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/VirtualMachine.hpp>
+#include <Hadrons/Module.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -56,7 +55,10 @@ public:
         GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
                                         TrajRange,                  trajCounter,
                                         VirtualMachine::GeneticPar, genetic,
-                                        std::string,                seed);
+                                        std::string,                runId,
+                                        std::string,                graphFile,
+                                        int,                        parallelWriteMaxRetry);
+        GlobalPar(void): parallelWriteMaxRetry{-1} {}
     };
 public:
     // constructors
diff --git a/Hadrons/DilutedNoise.hpp b/Hadrons/DilutedNoise.hpp
new file mode 100644
index 00000000..3867d763
--- /dev/null
+++ b/Hadrons/DilutedNoise.hpp
@@ -0,0 +1,249 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/DilutedNoise.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_DilutedNoise_hpp_
+#define Hadrons_DilutedNoise_hpp_
+
+#include <Hadrons/Global.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                   Abstract container for diluted noise                     *
+ ******************************************************************************/
+template <typename FImpl>
+class DilutedNoise
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    // constructor/destructor
+    DilutedNoise(GridCartesian *g);
+    DilutedNoise(GridCartesian *g, const unsigned int nNoise);
+    virtual ~DilutedNoise(void) = default;
+    // access
+    std::vector<FermionField> &       getNoise(void);
+    const std::vector<FermionField> & getNoise(void) const;
+    const FermionField &              operator[](const unsigned int i) const;
+    FermionField &                    operator[](const unsigned int i);
+    void                              resize(const unsigned int nNoise);
+    unsigned int                      size(void) const;
+    GridCartesian                     *getGrid(void) const;
+    // generate noise (pure virtual)
+    virtual void generateNoise(GridParallelRNG &rng) = 0;
+private:
+    std::vector<FermionField> noise_;
+    GridCartesian             *grid_;
+    unsigned int              nNoise_;
+};
+
+template <typename FImpl>
+class TimeDilutedSpinColorDiagonalNoise: public DilutedNoise<FImpl>
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    // constructor/destructor
+    TimeDilutedSpinColorDiagonalNoise(GridCartesian *g);
+    virtual ~TimeDilutedSpinColorDiagonalNoise(void) = default;
+    // generate noise
+    virtual void generateNoise(GridParallelRNG &rng);
+private:
+    unsigned int nt_;
+};
+
+template <typename FImpl>
+class FullVolumeSpinColorDiagonalNoise: public DilutedNoise<FImpl>
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    // constructor/destructor
+    FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int n_src);
+    virtual ~FullVolumeSpinColorDiagonalNoise(void) = default;
+    // generate noise
+    virtual void generateNoise(GridParallelRNG &rng);
+private:
+    unsigned int nSrc_;
+};
+
+
+/******************************************************************************
+ *                    DilutedNoise template implementation                    *
+ ******************************************************************************/
+template <typename FImpl>
+DilutedNoise<FImpl>::DilutedNoise(GridCartesian *g)
+: grid_(g), nNoise_(0) // empty container: no noise vector until resize() is called
+{}
+
+template <typename FImpl>
+DilutedNoise<FImpl>::DilutedNoise(GridCartesian *g,
+                                  const unsigned int nNoise)
+: DilutedNoise(g)
+{
+    resize(nNoise);
+}
+
+template <typename FImpl>
+std::vector<typename DilutedNoise<FImpl>::FermionField> & DilutedNoise<FImpl>::
+getNoise(void)
+{
+    return noise_;
+}
+
+template <typename FImpl>
+const std::vector<typename DilutedNoise<FImpl>::FermionField> & DilutedNoise<FImpl>::
+getNoise(void) const
+{
+    return noise_;
+}
+
+template <typename FImpl>
+const typename DilutedNoise<FImpl>::FermionField & 
+DilutedNoise<FImpl>::operator[](const unsigned int i) const
+{
+    return noise_[i];
+}
+
+template <typename FImpl>
+typename DilutedNoise<FImpl>::FermionField & 
+DilutedNoise<FImpl>::operator[](const unsigned int i)
+{
+    return noise_[i];
+}
+
+template <typename FImpl>
+void DilutedNoise<FImpl>::resize(const unsigned int nNoise)
+{
+    nNoise_ = nNoise;
+    noise_.resize(nNoise, grid_);
+}
+
+template <typename FImpl>
+unsigned int DilutedNoise<FImpl>::size(void) const
+{  
+    return noise_.size();
+}
+
+template <typename FImpl>
+GridCartesian * DilutedNoise<FImpl>::getGrid(void) const
+{
+    return grid_;
+}
+
+/******************************************************************************
+ *        TimeDilutedSpinColorDiagonalNoise template implementation           *
+ ******************************************************************************/
+template <typename FImpl>
+TimeDilutedSpinColorDiagonalNoise<FImpl>::
+TimeDilutedSpinColorDiagonalNoise(GridCartesian *g)
+: DilutedNoise<FImpl>(g)
+{   // nt_ is the extent of the last (time) direction, not the number of dimensions
+    nt_ = g->GlobalDimensions()[g->GlobalDimensions().size() - 1];
+    this->resize(nt_*Ns*FImpl::Dimension); // one vector per (time slice, spin, colour)
+}
+
+template <typename FImpl>
+void TimeDilutedSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
+{
+    typedef decltype(peekColour((*this)[0], 0)) SpinField;
+
+    auto                       &noise = *this;
+    auto                       g      = this->getGrid();
+    auto                       nd     = g->GlobalDimensions().size();
+    auto                       nc     = FImpl::Dimension;
+    Complex                    shift(1., 1.);
+    Lattice<iScalar<vInteger>> tLat(g);
+    LatticeComplex             eta(g), etaCut(g);
+    SpinField                  etas(g);
+    unsigned int               i = 0;
+
+    LatticeCoordinate(tLat, nd - 1);
+    bernoulli(rng, eta);
+    eta = (2.*eta - shift)*(1./::sqrt(2.));
+    for (unsigned int t = 0; t < nt_; ++t)
+    {
+        etaCut = where((tLat == t), eta, 0.*eta);
+        for (unsigned int s = 0; s < Ns; ++s)
+        {
+	    etas = Zero();
+	    pokeSpin(etas, etaCut, s);
+            for (unsigned int c = 0; c < nc; ++c)
+            {
+  	        noise[i] = Zero();
+                pokeColour(noise[i], etas, c);
+                i++;
+            }
+        }
+    }
+}
+
+/******************************************************************************
+ *        FullVolumeSpinColorDiagonalNoise template implementation           *
+ ******************************************************************************/
+template <typename FImpl>
+FullVolumeSpinColorDiagonalNoise<FImpl>::
+FullVolumeSpinColorDiagonalNoise(GridCartesian *g, unsigned int nSrc)
+: DilutedNoise<FImpl>(g, nSrc*Ns*FImpl::Dimension), nSrc_(nSrc)
+{}
+
+template <typename FImpl>
+void FullVolumeSpinColorDiagonalNoise<FImpl>::generateNoise(GridParallelRNG &rng)
+{
+    typedef decltype(peekColour((*this)[0], 0)) SpinField;
+
+    auto                       &noise = *this;
+    auto                       g      = this->getGrid();
+    auto                       nc     = FImpl::Dimension;
+    Complex                    shift(1., 1.);
+    LatticeComplex             eta(g);
+    SpinField                  etas(g);
+    unsigned int               i = 0;
+
+    // fills the nSrc_*Ns*nc vectors, each with eta in one (spin, colour) component
+    bernoulli(rng, eta); // presumably real and imaginary parts are 0 or 1 -- TODO confirm
+    eta = (2.*eta - shift)*(1./::sqrt(2.));
+    for (unsigned int n = 0; n < nSrc_; ++n) // NOTE(review): every n reuses the same eta -- confirm intended
+    {
+        for (unsigned int s = 0; s < Ns; ++s)
+        {
+            etas = Zero();
+            pokeSpin(etas, eta, s);
+            for (unsigned int c = 0; c < nc; ++c)
+            {
+                noise[i] = Zero();
+                pokeColour(noise[i], etas, c);
+                i++;
+            }
+        }
+    }
+}
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_DilutedNoise_hpp_
diff --git a/Hadrons/DiskVector.hpp b/Hadrons/DiskVector.hpp
new file mode 100644
index 00000000..70a00b16
--- /dev/null
+++ b/Hadrons/DiskVector.hpp
@@ -0,0 +1,442 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/DiskVector.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_DiskVector_hpp_
+#define Hadrons_DiskVector_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+#include <deque>
+#include <sys/stat.h>
+#include <ftw.h>
+#include <unistd.h>
+
+#ifdef DV_DEBUG
+#define DV_DEBUG_MSG(dv, stream) LOG(Debug) << "diskvector " << (dv) << ": " << stream << std::endl
+#else
+#define DV_DEBUG_MSG(dv, stream)
+#endif
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                           Abstract base class                              *
+ ******************************************************************************/
+template <typename T>
+class DiskVectorBase
+{
+public:
+    typedef T ObjectType;
+
+    // helper for read/write vector access
+    class RwAccessHelper
+    {
+    public:
+        RwAccessHelper(DiskVectorBase<T> &master, const unsigned int i)
+        : master_(master), cmaster_(master), i_(i) {}
+
+        // operator=: somebody is trying to store a vector element
+        // write to cache and tag as modified
+        T &operator=(const T &obj) const
+        {
+            auto &cache    = *master_.cachePtr_;
+            auto &modified = *master_.modifiedPtr_;
+            auto &index    = *master_.indexPtr_;
+
+            DV_DEBUG_MSG(&master_, "writing to " << i_);
+            master_.cacheInsert(i_, obj);
+            modified[index.at(i_)] = true;
+            
+            return cache[index.at(i_)];
+        }
+
+        // implicit cast to const object reference and redirection
+        // to the const operator[] for read-only operations
+        operator const T&() const
+        {
+            return cmaster_[i_];
+        }
+    private:
+        DiskVectorBase<T>       &master_;
+        const DiskVectorBase<T> &cmaster_;
+        const unsigned int      i_;
+    };
+public:
+    DiskVectorBase(const std::string dirname, const unsigned int size = 0,
+                   const unsigned int cacheSize = 1, const bool clean = true);
+    DiskVectorBase(DiskVectorBase<T> &&v) = default;
+    virtual ~DiskVectorBase(void);
+    const T & operator[](const unsigned int i) const;
+    RwAccessHelper operator[](const unsigned int i);
+    double hitRatio(void) const;
+    void resetStat(void);
+private:
+    virtual void load(T &obj, const std::string filename) const = 0;
+    virtual void save(const std::string filename, const T &obj) const = 0;
+    virtual std::string filename(const unsigned int i) const;
+    void evict(void) const;
+    void fetch(const unsigned int i) const;
+    void cacheInsert(const unsigned int i, const T &obj) const;
+    void clean(void);
+private:
+    std::string                                           dirname_;
+    unsigned int                                          size_, cacheSize_;
+    double                                                access_{0.}, hit_{0.};
+    bool                                                  clean_;
+    // using pointers to allow modifications when class is const
+    // semantic: const means data unmodified, but cache modification allowed
+    std::unique_ptr<std::vector<T>>                       cachePtr_;
+    std::unique_ptr<std::vector<bool>>                    modifiedPtr_;
+    std::unique_ptr<std::map<unsigned int, unsigned int>> indexPtr_;
+    std::unique_ptr<std::stack<unsigned int>>             freePtr_;
+    std::unique_ptr<std::deque<unsigned int>>             loadsPtr_;                
+};
+
+/******************************************************************************
+ *                   Specialisation for serialisable classes                  *
+ ******************************************************************************/
+template <typename T, typename Reader, typename Writer>
+class SerializableDiskVector: public DiskVectorBase<T>
+{
+public:
+    using DiskVectorBase<T>::DiskVectorBase;
+private:
+    virtual void load(T &obj, const std::string filename) const
+    {
+        Reader reader(filename);
+
+        read(reader, basename(filename), obj);
+    }
+
+    virtual void save(const std::string filename, const T &obj) const
+    {
+        Writer writer(filename);
+
+        write(writer, basename(filename), obj);
+    }
+};
+
+/******************************************************************************
+ *                      Specialisation for Eigen matrices                     *
+ ******************************************************************************/
+template <typename T>
+using EigenDiskVectorMat = A2AMatrix<T>;
+
+template <typename T>
+class EigenDiskVector: public DiskVectorBase<EigenDiskVectorMat<T>>
+{
+public:
+    using DiskVectorBase<EigenDiskVectorMat<T>>::DiskVectorBase;
+    typedef EigenDiskVectorMat<T> Matrix;
+public:
+    T operator()(const unsigned int i, const Eigen::Index j,
+                 const Eigen::Index k) const
+    {
+        return (*this)[i](j, k);
+    }
+private:
+    virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
+    {
+        std::ifstream f(filename, std::ios::binary);
+        uint32_t      crc, check;
+        Eigen::Index  nRow, nCol;
+        size_t        matSize;
+        double        tRead, tHash;
+
+        f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
+        f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
+        f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
+        obj.resize(nRow, nCol);
+        matSize = nRow*nCol*sizeof(T);
+        tRead  = -usecond();
+        f.read(reinterpret_cast<char *>(obj.data()), matSize);
+        tRead += usecond();
+        tHash  = -usecond();
+#ifdef USE_IPP
+        check  = GridChecksum::crc32c(obj.data(), matSize);
+#else
+        check  = GridChecksum::crc32(obj.data(), matSize);
+#endif
+        tHash += usecond();
+        DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
+        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
+                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+        if (crc != check)
+        {
+            HADRONS_ERROR(Io, "checksum failed")
+        }
+    }
+
+    virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
+    {
+        std::ofstream f(filename, std::ios::binary);
+        uint32_t      crc;
+        Eigen::Index  nRow, nCol;
+        size_t        matSize;
+        double        tWrite, tHash;
+        
+        nRow    = obj.rows();
+        nCol    = obj.cols();
+        matSize = nRow*nCol*sizeof(T);
+        tHash   = -usecond();
+#ifdef USE_IPP
+        crc     = GridChecksum::crc32c(obj.data(), matSize);
+#else
+        crc     = GridChecksum::crc32(obj.data(), matSize);
+#endif
+        tHash  += usecond();
+        f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
+        f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
+        f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
+        tWrite = -usecond();
+        f.write(reinterpret_cast<const char *>(obj.data()), matSize);
+        tWrite += usecond();
+        DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
+        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
+                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+    }
+};
+
+/******************************************************************************
+ *                       DiskVectorBase implementation                         *
+ ******************************************************************************/
+template <typename T>
+DiskVectorBase<T>::DiskVectorBase(const std::string dirname, 
+                                  const unsigned int size,
+                                  const unsigned int cacheSize,
+                                  const bool clean)
+: dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean)
+, cachePtr_(new std::vector<T>(cacheSize))              // one object per cache slot
+, modifiedPtr_(new std::vector<bool>(cacheSize, false)) // one flag per cache slot
+, indexPtr_(new std::map<unsigned int, unsigned int>())
+, freePtr_(new std::stack<unsigned int>)
+, loadsPtr_(new std::deque<unsigned int>())
+{
+    struct stat s;
+    // an already existing path is an error, it is never reused or overwritten
+    if(stat(dirname.c_str(), &s) == 0)
+    {
+        HADRONS_ERROR(Io, "directory '" + dirname + "' already exists");
+    }
+    mkdir(dirname);
+    for (unsigned int i = 0; i < cacheSize_; ++i) // initially every slot 0..cacheSize_-1 is free
+    {
+        freePtr_->push(i);
+    }
+}
+
+template <typename T>
+DiskVectorBase<T>::~DiskVectorBase(void)
+{
+    if (clean_)
+    {
+        clean();
+    }
+}
+
+template <typename T>
+const T & DiskVectorBase<T>::operator[](const unsigned int i) const
+{
+    auto &cache   = *cachePtr_;
+    auto &index   = *indexPtr_;
+    auto &freeInd = *freePtr_; // not used in this method
+    auto &loads   = *loadsPtr_;
+    // read-only access: returns element i from the cache, loading it from disk on a miss
+    DV_DEBUG_MSG(this, "accessing " << i << " (RO)");
+
+    if (i >= size_)
+    {
+        HADRONS_ERROR(Size, "index out of range");
+    }
+    const_cast<double &>(access_)++; // statistics are updated although the method is const
+    if (index.find(i) == index.end())
+    {
+        // cache miss
+        DV_DEBUG_MSG(this, "cache miss");
+        fetch(i);
+    }
+    else
+    {
+        DV_DEBUG_MSG(this, "cache hit");
+        // move i to the back of the load queue (evict() removes from the front)
+        auto pos = std::find(loads.begin(), loads.end(), i);
+
+        const_cast<double &>(hit_)++;
+        loads.erase(pos);
+        loads.push_back(i);
+    }
+
+#ifdef DV_DEBUG
+    std::string msg;
+
+    for (auto &p: loads)
+    {
+        msg += std::to_string(p) + " ";
+    }
+    DV_DEBUG_MSG(this, "in cache: " << msg);
+#endif
+    // the reference points into the cache: its slot can be reused once i is evicted
+    return cache[index.at(i)];
+}
+
+template <typename T>
+typename DiskVectorBase<T>::RwAccessHelper DiskVectorBase<T>::operator[](const unsigned int i)
+{
+    DV_DEBUG_MSG(this, "accessing " << i << " (RW)");
+
+    if (i >= size_)
+    {
+        HADRONS_ERROR(Size, "index out of range");
+    }
+
+    return RwAccessHelper(*this, i);
+}
+
+template <typename T>
+double DiskVectorBase<T>::hitRatio(void) const
+{   // fraction of read-only accesses served from the cache; 0 if none was counted yet
+    return (access_ > 0.) ? hit_/access_ : 0.;
+}
+
+template <typename T>
+void DiskVectorBase<T>::resetStat(void)
+{
+    access_ = 0.;
+    hit_    = 0.;
+}
+
+template <typename T>
+std::string DiskVectorBase<T>::filename(const unsigned int i) const
+{
+    return dirname_ + "/elem_" + std::to_string(i);
+}
+
+template <typename T>
+void DiskVectorBase<T>::evict(void) const
+{
+    auto &cache    = *cachePtr_;
+    auto &modified = *modifiedPtr_;
+    auto &index    = *indexPtr_;
+    auto &freeInd  = *freePtr_;
+    auto &loads    = *loadsPtr_;
+    // no-op unless the cache is full; otherwise frees the slot of the front of the load queue
+    if (index.size() >= cacheSize_)
+    {
+        unsigned int i = loads.front();
+        // only elements flagged as modified are written to disk before their slot is released
+        DV_DEBUG_MSG(this, "evicting " << i);
+        if (modified[index.at(i)])
+        {
+            DV_DEBUG_MSG(this, "element " << i << " modified, saving to disk");
+            save(filename(i), cache[index.at(i)]);
+        }
+        freeInd.push(index.at(i)); // the slot becomes available again
+        index.erase(i);
+        loads.pop_front();
+    }
+}
+
+template <typename T>
+void DiskVectorBase<T>::fetch(const unsigned int i) const
+{
+    auto &cache    = *cachePtr_;
+    auto &modified = *modifiedPtr_;
+    auto &index    = *indexPtr_;
+    auto &freeInd  = *freePtr_;
+    auto &loads    = *loadsPtr_;
+    // loads element i from its file into a free cache slot (called on a cache miss)
+    struct stat s;
+
+    DV_DEBUG_MSG(this, "loading " << i << " from disk");
+
+    evict(); // runs before the file check below, so a failed fetch may already have evicted
+    // a missing file means element i was never saved to disk
+    if(stat(filename(i).c_str(), &s) != 0)
+    {
+        HADRONS_ERROR(Io, "disk vector element " + std::to_string(i) + " uninitialised");
+    }
+    index[i] = freeInd.top();
+    freeInd.pop();
+    load(cache[index.at(i)], filename(i));
+    loads.push_back(i); // i goes to the back of the load queue
+    modified[index.at(i)] = false; // freshly loaded data matches the file on disk
+}
+
+template <typename T>
+void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
+{
+    auto &cache    = *cachePtr_;
+    auto &modified = *modifiedPtr_;
+    auto &index    = *indexPtr_;
+    auto &freeInd  = *freePtr_;
+    auto &loads    = *loadsPtr_;
+    // store obj as element i: reuse the slot and drop the old queue entry if i is
+    // already cached, otherwise make room with evict() and take a free slot
+    if (index.find(i) != index.end()) loads.erase(std::find(loads.begin(), loads.end(), i));
+    else { evict(); index[i] = freeInd.top(); freeInd.pop(); }
+    cache[index.at(i)] = obj;
+    loads.push_back(i);            // i goes to the back of the load queue
+    modified[index.at(i)] = false; // the caller (RwAccessHelper::operator=) sets the flag
+
+#ifdef DV_DEBUG
+    std::string msg;
+
+    for (auto &p: loads)
+    {
+        msg += std::to_string(p) + " ";
+    }
+    DV_DEBUG_MSG(this, "in cache: " << msg);
+#endif
+}
+
+#ifdef DV_DEBUG
+#undef DV_DEBUG_MSG
+#endif
+
+template <typename T>
+void DiskVectorBase<T>::clean(void)
+{
+    auto unlink = [](const char *fpath, const struct stat *sb, 
+                     int typeflag, struct FTW *ftwbuf)
+    {
+        int rv = remove(fpath);
+        // NOTE(review): clean() runs from the destructor; confirm HADRONS_ERROR is safe there
+        if (rv)
+        {
+            HADRONS_ERROR(Io, "cannot remove '" + std::string(fpath) + "': "
+                          + std::string(std::strerror(errno)));
+        }
+
+        return rv;
+    };
+    // FTW_DEPTH: entries are visited before their directory; FTW_PHYS: symlinks are not followed
+    nftw(dirname_.c_str(), unlink, 64, FTW_DEPTH | FTW_PHYS);
+}
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_DiskVector_hpp_
diff --git a/Hadrons/EigenPack.hpp b/Hadrons/EigenPack.hpp
new file mode 100644
index 00000000..0ce44e37
--- /dev/null
+++ b/Hadrons/EigenPack.hpp
@@ -0,0 +1,414 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/EigenPack.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_EigenPack_hpp_
+#define Hadrons_EigenPack_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Grid/algorithms/iterative/Deflation.h>
+#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
+
+BEGIN_HADRONS_NAMESPACE
+
+// Lanczos type
+#ifndef HADRONS_DEFAULT_LANCZOS_NBASIS
+#define HADRONS_DEFAULT_LANCZOS_NBASIS 60
+#endif
+
+#define HADRONS_DUMP_EP_METADATA(record) \
+LOG(Message) << "Eigenpack metadata:" << std::endl;\
+LOG(Message) << "* operator" << std::endl;\
+LOG(Message) << (record).operatorXml << std::endl;\
+LOG(Message) << "* solver" << std::endl;\
+LOG(Message) << (record).solverXml << std::endl;
+
+struct PackRecord // eigenpack metadata: XML strings for the operator and the solver
+{
+    std::string operatorXml, solverXml;
+};
+
+struct VecRecord: Serializable // record read/written together with each eigenvector field
+{
+    GRID_SERIALIZABLE_CLASS_MEMBERS(VecRecord,
+                                    unsigned int, index,
+                                    double,       eval);
+    VecRecord(void): index(0), eval(0.) {} // index 0 and eigenvalue 0. until filled in
+};
+
+namespace EigenPackIo
+{
+    inline void readHeader(PackRecord &record, ScidacReader &binReader)
+    {
+        std::string recordXml;
+        // fill `record` from the XML string stored as a LIME object in the file
+        binReader.readLimeObject(recordXml, SCIDAC_FILE_XML);
+        XmlReader xmlReader(recordXml, true, "eigenPackPar");
+        xmlReader.push();
+        xmlReader.readCurrentSubtree(record.operatorXml); // operator description comes first
+        xmlReader.nextElement();
+        xmlReader.readCurrentSubtree(record.solverXml);   // solver description follows it
+    }
+
+    template <typename T, typename TIo = T>
+    void readElement(T &evec, RealD &eval, const unsigned int index,
+                     ScidacReader &binReader, TIo *ioBuf = nullptr)
+    {
+        VecRecord vecRecord;
+        // read one eigenvector and its eigenvalue; `index` is the position the caller expects
+        LOG(Message) << "Reading eigenvector " << index << std::endl;
+        if (ioBuf == nullptr)
+        {
+            binReader.readScidacFieldRecord(evec, vecRecord);  // same type on disk and in memory
+        }
+        else
+        {
+            binReader.readScidacFieldRecord(*ioBuf, vecRecord); // read in the I/O type...
+            precisionChange(evec, *ioBuf);                      // ...then convert into evec
+        }
+        if (vecRecord.index != index)
+        {
+            HADRONS_ERROR(Io, "Eigenvector " + std::to_string(index) + " has a"
+                            + " wrong index (found " + std::to_string(vecRecord.index) 
+                            + ")"); // the record's index is what was found, not what was expected
+        }
+        eval = vecRecord.eval;
+    }
+
+    template <typename T, typename TIo = T>
+    static void readPack(std::vector<T> &evec, std::vector<RealD> &eval,
+                         PackRecord &record, const std::string filename, 
+                         const unsigned int size, bool multiFile, 
+                         GridBase *gridIo = nullptr)
+    {
+        std::unique_ptr<TIo> ioBuf{nullptr};
+        ScidacReader         binReader;
+        // read `size` eigenvectors into evec/eval; a conversion buffer is only needed when TIo differs from T
+        if (typeHash<T>() != typeHash<TIo>())
+        {
+            if (gridIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "I/O type different from vector type but null I/O grid passed");
+            }
+            ioBuf.reset(new TIo(gridIo));
+        }
+        if (multiFile)
+        {
+            std::string fullFilename;
+            // one file per vector, <filename>/v<k>.bin, each carrying its own header
+            for(unsigned int k = 0; k < size; ++k) 
+            {
+                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
+                binReader.open(fullFilename);
+                readHeader(record, binReader);
+                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
+                binReader.close();
+            }
+        }
+        else
+        {
+            binReader.open(filename);
+            readHeader(record, binReader);
+            for(unsigned int k = 0; k < size; ++k) // unsigned counter, matching the type of `size`
+            {
+                readElement(evec[k], eval[k], k, binReader, ioBuf.get());
+            }
+            binReader.close();
+        }
+    }
+
+    inline void writeHeader(ScidacWriter &binWriter, PackRecord &record)
+    {
+        XmlWriter xmlWriter("", "eigenPackPar");
+        // operator XML first, solver XML second: readHeader() reads them back in this order
+        xmlWriter.pushXmlString(record.operatorXml);
+        xmlWriter.pushXmlString(record.solverXml);
+        binWriter.writeLimeObject(1, 1, xmlWriter, "parameters", SCIDAC_FILE_XML);
+    }
+
+    template <typename T, typename TIo = T>
+    void writeElement(ScidacWriter &binWriter, T &evec, RealD &eval, 
+                      const unsigned int index, TIo *ioBuf, 
+                      T *testBuf = nullptr)
+    {
+        VecRecord header;
+        // both buffers present: write through the I/O type; otherwise write evec as is
+        LOG(Message) << "Writing eigenvector " << index << std::endl;
+        header.index = index;
+        header.eval  = eval;
+        if ((ioBuf != nullptr) && (testBuf != nullptr))
+        {
+            precisionChange(*ioBuf, evec);
+            precisionChange(*testBuf, *ioBuf);
+            *testBuf -= evec;
+            LOG(Message) << "Precision diff norm^2 " << norm2(*testBuf) << std::endl;
+            binWriter.writeScidacFieldRecord(*ioBuf, header, DEFAULT_ASCII_PREC);
+        }
+        else
+        {
+            binWriter.writeScidacFieldRecord(evec, header, DEFAULT_ASCII_PREC);
+        }
+    }
+    
+    template <typename T, typename TIo = T>
+    static void writePack(const std::string filename, std::vector<T> &evec, 
+                          std::vector<RealD> &eval, PackRecord &record, 
+                          const unsigned int size, bool multiFile, 
+                          GridBase *gridIo = nullptr)
+    {
+        GridBase             *grid = evec[0].Grid(); // evec must hold at least one vector
+        std::unique_ptr<TIo> ioBuf{nullptr}; 
+        std::unique_ptr<T>   testBuf{nullptr};
+        ScidacWriter         binWriter(grid->IsBoss());
+        // write `size` eigenvectors; conversion buffers are only allocated when TIo differs from T
+        if (typeHash<T>() != typeHash<TIo>())
+        {
+            if (gridIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "I/O type different from vector type but null I/O grid passed");
+            }
+            ioBuf.reset(new TIo(gridIo));
+            testBuf.reset(new T(grid));
+        }
+        if (multiFile)
+        {
+            std::string fullFilename;
+            // one file per vector, <filename>/v<k>.bin, each with its own header
+            for(unsigned int k = 0; k < size; ++k) 
+            {
+                fullFilename = filename + "/v" + std::to_string(k) + ".bin";
+                // make sure the target directory exists before opening the file
+                makeFileDir(fullFilename, grid);
+                binWriter.open(fullFilename);
+                writeHeader(binWriter, record);
+                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
+                binWriter.close();
+            }
+        }
+        else
+        {
+            makeFileDir(filename, grid);
+            binWriter.open(filename);
+            writeHeader(binWriter, record);
+            for(unsigned int k = 0; k < size; ++k) // unsigned counter, matching the type of `size`
+            {
+                writeElement(binWriter, evec[k], eval[k], k, ioBuf.get(), testBuf.get());
+            }
+            binWriter.close();
+        }
+    }
+}
+
+template <typename F>
+class BaseEigenPack // eigenvalues, eigenvectors of field type F, and their metadata record
+{
+public:
+    typedef F Field;
+public:
+    std::vector<RealD> eval;   // eigenvalues
+    std::vector<F>     evec;   // eigenvectors
+    PackRecord         record; // operator/solver XML metadata
+public:
+    BaseEigenPack(void)          = default;
+    BaseEigenPack(const size_t size, GridBase *grid) // pack of `size` vectors built from `grid`
+    {
+        resize(size, grid);
+    }
+    virtual ~BaseEigenPack(void) = default;
+    void resize(const size_t size, GridBase *grid) // resizes eval and evec to `size` entries
+    {
+        eval.resize(size);
+        evec.resize(size, grid); // new elements are constructed from `grid`
+    }
+};
+
+template <typename F, typename FIo = F>
+class EigenPack: public BaseEigenPack<F> // eigenpack with file I/O, optionally through a different field type FIo
+{
+public:
+    typedef F   Field;
+    typedef FIo FieldIo;
+public:
+    EigenPack(void)          = default;
+    virtual ~EigenPack(void) = default;
+    // gridIo is the grid for the FIo buffers; it must be non-null when FIo differs from F
+    EigenPack(const size_t size, GridBase *grid, GridBase *gridIo = nullptr)
+    : BaseEigenPack<F>(size, grid)
+    {
+        if (typeHash<F>() != typeHash<FIo>())
+        {
+            if (gridIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "I/O type different from vector type but null I/O grid passed");
+            }
+        }
+        gridIo_ = gridIo;
+    }
+    // read evec.size() vectors from the file(s) named by evecFilename(), then log the metadata
+    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        EigenPackIo::readPack<F, FIo>(this->evec, this->eval, this->record, 
+                                      evecFilename(fileStem, traj, multiFile), 
+                                      this->evec.size(), multiFile, gridIo_);
+        HADRONS_DUMP_EP_METADATA(this->record);
+    }
+    // write evec.size() vectors to the file(s) named by evecFilename()
+    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        EigenPackIo::writePack<F, FIo>(evecFilename(fileStem, traj, multiFile), 
+                                       this->evec, this->eval, this->record, 
+                                       this->evec.size(), multiFile, gridIo_);
+    }
+protected:
+    std::string evecFilename(const std::string stem, const int traj, const bool multiFile)
+    {
+        std::string t = (traj < 0) ? "" : ("." + std::to_string(traj)); // negative traj: no suffix
+
+        if (multiFile)
+        {
+            return stem + t;          // directory name; per-vector files live inside it
+        }
+        else
+        {
+            return stem + t + ".bin"; // single file holding the whole pack
+        }
+    }
+protected:
+    GridBase *gridIo_{nullptr}; // null by default so a default-constructed pack never passes an indeterminate pointer to the I/O routines
+};
+
+template <typename FineF, typename CoarseF, 
+          typename FineFIo = FineF, typename CoarseFIo = CoarseF>
+class CoarseEigenPack: public EigenPack<FineF, FineFIo> // fine eigenpack plus a second, coarse set of vectors
+{
+public:
+    typedef CoarseF CoarseField;         
+    std::vector<CoarseF> evecCoarse;
+    std::vector<RealD>   evalCoarse;
+public:
+    CoarseEigenPack(void)          = default;
+    virtual ~CoarseEigenPack(void) = default;
+    // each I/O grid must be non-null when the corresponding I/O type differs from the in-memory type
+    CoarseEigenPack(const size_t sizeFine, const size_t sizeCoarse, 
+                    GridBase *gridFine, GridBase *gridCoarse,
+                    GridBase *gridFineIo = nullptr, 
+                    GridBase *gridCoarseIo = nullptr)
+    {
+        if (typeHash<FineF>() != typeHash<FineFIo>())
+        {
+            if (gridFineIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "Fine I/O type different from vector type but null fine I/O grid passed");
+            }
+        }
+        if (typeHash<CoarseF>() != typeHash<CoarseFIo>())
+        {
+            if (gridCoarseIo == nullptr)
+            {
+                HADRONS_ERROR(Definition, 
+                              "Coarse I/O type different from vector type but null coarse I/O grid passed");
+            }
+        }
+        this->gridIo_ = gridFineIo;
+        gridCoarseIo_ = gridCoarseIo;
+        resize(sizeFine, sizeCoarse, gridFine, gridCoarse);
+    }
+    // resize the fine part through the base class, then the coarse vectors
+    void resize(const size_t sizeFine, const size_t sizeCoarse, 
+                GridBase *gridFine, GridBase *gridCoarse)
+    {
+        EigenPack<FineF, FineFIo>::resize(sizeFine, gridFine);
+        evalCoarse.resize(sizeCoarse);
+        evecCoarse.resize(sizeCoarse, gridCoarse);
+    }
+    // fine vectors are stored under the stem "<fileStem>_fine"
+    void readFine(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        EigenPack<FineF, FineFIo>::read(fileStem + "_fine", multiFile, traj);
+    }
+    // coarse vectors are stored under the stem "<fileStem>_coarse"
+    void readCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        PackRecord dummy;
+        // the coarse file's header is read into `dummy`, leaving this->record untouched
+        EigenPackIo::readPack<CoarseF, CoarseFIo>(evecCoarse, evalCoarse, dummy, 
+                              this->evecFilename(fileStem + "_coarse", traj, multiFile), 
+                              evecCoarse.size(), multiFile, gridCoarseIo_);
+    }
+    // read the fine part, then the coarse part
+    virtual void read(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        readFine(fileStem, multiFile, traj);
+        readCoarse(fileStem, multiFile, traj);
+    }
+    // write the fine vectors under the stem "<fileStem>_fine"
+    void writeFine(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        EigenPack<FineF, FineFIo>::write(fileStem + "_fine", multiFile, traj);
+    }
+    // write the coarse vectors under "<fileStem>_coarse", with the same metadata record as the fine part
+    void writeCoarse(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        EigenPackIo::writePack<CoarseF, CoarseFIo>(this->evecFilename(fileStem + "_coarse", traj, multiFile), 
+                                                   evecCoarse, evalCoarse, this->record, 
+                                                   evecCoarse.size(), multiFile, gridCoarseIo_);
+    }
+    // write the fine part, then the coarse part
+    virtual void write(const std::string fileStem, const bool multiFile, const int traj = -1)
+    {
+        writeFine(fileStem, multiFile, traj);
+        writeCoarse(fileStem, multiFile, traj);
+    }
+private:
+    GridBase *gridCoarseIo_{nullptr}; // null by default so a default-constructed pack never passes an indeterminate pointer to the I/O routines
+};
+
+template <typename FImpl>
+using BaseFermionEigenPack = BaseEigenPack<typename FImpl::FermionField>;
+
+template <typename FImpl, typename FImplIo = FImpl>
+using FermionEigenPack = EigenPack<typename FImpl::FermionField, typename FImplIo::FermionField>;
+
+template <typename FImpl, int nBasis, typename FImplIo = FImpl>
+using CoarseFermionEigenPack = CoarseEigenPack<
+    typename FImpl::FermionField,
+    typename LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
+                                   typename FImpl::SiteComplex, 
+                                   nBasis>::CoarseField,
+    typename FImplIo::FermionField,
+    typename LocalCoherenceLanczos<typename FImplIo::SiteSpinor, 
+                                   typename FImplIo::SiteComplex, 
+                                   nBasis>::CoarseField>;
+
+#undef HADRONS_DUMP_EP_METADATA
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_EigenPack_hpp_
diff --git a/extras/Hadrons/Environment.cc b/Hadrons/Environment.cc
similarity index 77%
rename from extras/Hadrons/Environment.cc
rename to Hadrons/Environment.cc
index 03141b9b..507a306f 100644
--- a/extras/Hadrons/Environment.cc
+++ b/Hadrons/Environment.cc
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Environment.cc
+Source file: Hadrons/Environment.cc
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -27,16 +26,16 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Hadrons/Environment.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Environment.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 using namespace Grid;
  
 using namespace Hadrons;
 
 #define ERROR_NO_ADDRESS(address)\
-HADRON_ERROR(Definition, "no object with address " + std::to_string(address));
+HADRONS_ERROR_REF(ObjectDefinition, "no object with address " + std::to_string(address), address);
 
 /******************************************************************************
  *                       Environment implementation                           *
@@ -46,69 +45,16 @@ Environment::Environment(void)
 {
     dim_ = GridDefaultLatt().toVector();
     nd_  = dim_.size();
-    grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
-        dim_, GridDefaultSimd(nd_, vComplex::Nsimd()),
-        GridDefaultMpi()));
-    gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
-    auto loc = getGrid()->LocalDimensions();
-    locVol_ = 1;
-    for (unsigned int d = 0; d < loc.size(); ++d)
+    createGrid<vComplex>(1);
+    vol_ = 1.;
+    for (auto d: dim_)
     {
-        locVol_ *= loc[d];
+        vol_ *= d;
     }
-    rng4d_.reset(new GridParallelRNG(grid4d_.get()));
+    rng4d_.reset(new GridParallelRNG(getGrid()));
 }
 
 // grids ///////////////////////////////////////////////////////////////////////
-void Environment::createGrid(const unsigned int Ls)
-{
-    if (grid5d_.find(Ls) == grid5d_.end())
-    {
-        auto g = getGrid();
-        
-        grid5d_[Ls].reset(SpaceTimeGrid::makeFiveDimGrid(Ls, g));
-        gridRb5d_[Ls].reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, g));
-    }
-}
-
-GridCartesian * Environment::getGrid(const unsigned int Ls) const
-{
-    try
-    {
-        if (Ls == 1)
-        {
-            return grid4d_.get();
-        }
-        else
-        {
-            return grid5d_.at(Ls).get();
-        }
-    }
-    catch(std::out_of_range &)
-    {
-        HADRON_ERROR(Definition, "no grid with Ls= " + std::to_string(Ls));
-    }
-}
-
-GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const
-{
-    try
-    {
-        if (Ls == 1)
-        {
-            return gridRb4d_.get();
-        }
-        else
-        {
-            return gridRb5d_.at(Ls).get();
-        }
-    }
-    catch(std::out_of_range &)
-    {
-        HADRON_ERROR(Definition, "no red-black 5D grid with Ls= " + std::to_string(Ls));
-    }
-}
-
 unsigned int Environment::getNd(void) const
 {
     return nd_;
@@ -124,17 +70,12 @@ int Environment::getDim(const unsigned int mu) const
     return dim_[mu];
 }
 
-unsigned long int Environment::getLocalVolume(void) const
+double Environment::getVolume(void) const
 {
-    return locVol_;
+    return vol_;
 }
 
 // random number generator /////////////////////////////////////////////////////
-void Environment::setSeed(const std::vector<int> &seed)
-{
-    rng4d_->SeedFixedIntegers(seed);
-}
-
 GridParallelRNG * Environment::get4dRng(void) const
 {
     return rng4d_.get();
@@ -155,7 +96,8 @@ void Environment::addObject(const std::string name, const int moduleAddress)
     }
     else
     {
-        HADRON_ERROR(Definition, "object '" + name + "' already exists");
+        HADRONS_ERROR_REF(ObjectDefinition, "object '" + name + "' already exists",
+                          getObjectAddress(name));
     }
 }
 
@@ -178,7 +120,7 @@ unsigned int Environment::getObjectAddress(const std::string name) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no object with name '" + name + "'");
+        HADRONS_ERROR(Definition, "no object with name '" + name + "'");
     }
 }
 
@@ -271,7 +213,7 @@ int Environment::getObjectModule(const std::string name) const
 
 unsigned int Environment::getObjectLs(const unsigned int address) const
 {
-    if (hasObject(address))
+    if (hasCreatedObject(address))
     {
         return object_[address].Ls;
     }
diff --git a/Hadrons/Environment.hpp b/Hadrons/Environment.hpp
new file mode 100644
index 00000000..9841d665
--- /dev/null
+++ b/Hadrons/Environment.hpp
@@ -0,0 +1,585 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Environment.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_Environment_hpp_
+#define Hadrons_Environment_hpp_
+
+#include <Hadrons/Global.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         Global environment                                 *
+ ******************************************************************************/
+class Object // polymorphic base class; Holder<T> derives from it and ObjInfo owns data through it
+{
+public:
+    Object(void) = default;
+    virtual ~Object(void) = default; // virtual so deleting through an Object pointer destroys the derived holder
+};
+
+template <typename T>
+class Holder: public Object // owns a single T through a unique_ptr
+{
+public:
+    Holder(void) = default;
+    Holder(T *pt);                 // takes ownership of pt
+    virtual ~Holder(void) = default;
+    T &       get(void) const;     // reference to the held object (dereferences the pointer)
+    T *       getPt(void) const;   // raw pointer to the held object, null when empty
+    void      reset(T *pt);        // takes ownership of pt in place of the current object
+private:
+    std::unique_ptr<T> objPt_{nullptr};
+};
+
+#define DEFINE_ENV_ALIAS \
+inline Environment & env(void) const\
+{\
+    return Environment::getInstance();\
+}
+
+#define DEFINE_ENV_LAMBDA \
+auto env = [](void)->Environment &{return Environment::getInstance();}
+
+class Environment // holds the lattice grids, the 4D RNG and the store of named objects
+{
+    SINGLETON(Environment);
+public:
+    typedef SITE_SIZE_TYPE                         Size;
+    typedef std::unique_ptr<GridCartesian>         GridPt;
+    typedef std::unique_ptr<GridRedBlackCartesian> GridRbPt;
+    typedef std::unique_ptr<GridParallelRNG>       RngPt;
+    enum class Storage {object, cache, temporary};
+private:
+    struct ObjInfo
+    {
+        Size                    size{0};
+        Storage                 storage{Storage::object};
+        unsigned int            Ls{0};
+        const std::type_info    *type{nullptr}, *derivedType{nullptr};
+        std::string             name;
+        int                     module{-1}; // NOTE(review): -1 presumably means "no owning module" - confirm
+        std::unique_ptr<Object> data{nullptr};
+    };
+    typedef std::pair<size_t, unsigned int>     FineGridKey;   // (vector type hash, Ls)
+    typedef std::pair<size_t, std::vector<int>> CoarseGridKey; // (vector type hash, block sizes)
+public:
+    // grids
+    template <typename VType = vComplex>
+    void                    createGrid(const unsigned int Ls);
+    template <typename VType = vComplex>
+    void                    createCoarseGrid(const std::vector<int> &blockSize,
+                                             const unsigned int Ls);
+    template <typename VType = vComplex>
+    GridCartesian *         getGrid(void);
+    template <typename VType = vComplex>
+    GridRedBlackCartesian * getRbGrid(void);
+    template <typename VType = vComplex>
+    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize);
+    template <typename VType = vComplex>
+    GridCartesian *         getGrid(const unsigned int Ls);
+    template <typename VType = vComplex>
+    GridRedBlackCartesian * getRbGrid(const unsigned int Ls);
+    template <typename VType = vComplex>
+    GridCartesian *         getCoarseGrid(const std::vector<int> &blockSize,
+                                          const unsigned int Ls);
+    std::vector<int>        getDim(void) const;
+    int                     getDim(const unsigned int mu) const;
+    unsigned int            getNd(void) const;
+    double                  getVolume(void) const;
+    // random number generator
+    GridParallelRNG *       get4dRng(void) const;
+    // general memory management
+    void                    addObject(const std::string name,
+                                      const int moduleAddress = -1);
+    template <typename B, typename T, typename ... Ts>
+    void                    createDerivedObject(const std::string name,
+                                                const Environment::Storage storage,
+                                                const unsigned int Ls,
+                                                Ts && ... args);
+    template <typename T, typename ... Ts>
+    void                    createObject(const std::string name,
+                                         const Environment::Storage storage,
+                                         const unsigned int Ls,
+                                         Ts && ... args);
+    void                    setObjectModule(const unsigned int objAddress,
+                                            const int modAddress);
+    template <typename B, typename T>
+    T *                     getDerivedObject(const unsigned int address) const;
+    template <typename B, typename T>
+    T *                     getDerivedObject(const std::string name) const;
+    template <typename T>
+    T *                     getObject(const unsigned int address) const;
+    template <typename T>
+    T *                     getObject(const std::string name) const;
+    unsigned int            getMaxAddress(void) const;
+    unsigned int            getObjectAddress(const std::string name) const;
+    std::string             getObjectName(const unsigned int address) const;
+    std::string             getObjectType(const unsigned int address) const;
+    std::string             getObjectType(const std::string name) const;
+    Size                    getObjectSize(const unsigned int address) const;
+    Size                    getObjectSize(const std::string name) const;
+    Storage                 getObjectStorage(const unsigned int address) const;
+    Storage                 getObjectStorage(const std::string name) const;
+    int                     getObjectModule(const unsigned int address) const;
+    int                     getObjectModule(const std::string name) const;
+    unsigned int            getObjectLs(const unsigned int address) const;
+    unsigned int            getObjectLs(const std::string name) const;
+    bool                    hasObject(const unsigned int address) const;
+    bool                    hasObject(const std::string name) const;
+    bool                    hasCreatedObject(const unsigned int address) const;
+    bool                    hasCreatedObject(const std::string name) const;
+    bool                    isObject5d(const unsigned int address) const;
+    bool                    isObject5d(const std::string name) const;
+    template <typename T>
+    bool                    isObjectOfType(const unsigned int address) const;
+    template <typename T>
+    bool                    isObjectOfType(const std::string name) const;
+    Environment::Size       getTotalSize(void) const;
+    void                    freeObject(const unsigned int address);
+    void                    freeObject(const std::string name);
+    void                    freeAll(void);
+    void                    protectObjects(const bool protect);
+    bool                    objectsProtected(void) const;
+    // print environment content
+    void                    printContent(void) const;
+private:
+    // general
+    double                              vol_;
+    bool                                protect_{true};
+    // grids
+    std::vector<int>                    dim_;
+    std::map<FineGridKey, GridPt>       grid4d_;       // filled by createGrid() with key (hash, 1)
+    std::map<FineGridKey, GridPt>       grid5d_;       // filled by createGrid() with key (hash, Ls)
+    std::map<FineGridKey, GridRbPt>     gridRb4d_;
+    std::map<FineGridKey, GridRbPt>     gridRb5d_;
+    std::map<CoarseGridKey, GridPt>     gridCoarse4d_;
+    std::map<CoarseGridKey, GridPt>     gridCoarse5d_;
+    unsigned int                        nd_;
+    // random number generator
+    RngPt                               rng4d_;
+    // object store
+    std::vector<ObjInfo>                object_;
+    std::map<std::string, unsigned int> objectAddress_; // object name -> address
+};
+
+/******************************************************************************
+ *                       Holder template implementation                       *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename T>
+Holder<T>::Holder(T *pt)
+: objPt_{pt}
+{}
+
+// access //////////////////////////////////////////////////////////////////////
+template <typename T>
+T & Holder<T>::get(void) const
+{
+    return *objPt_;
+}
+// raw pointer to the held object (null when nothing is held)
+template <typename T>
+T * Holder<T>::getPt(void) const
+{
+    return objPt_.get();
+}
+// hand a new object to the holder; unique_ptr disposes of the previous one
+template <typename T>
+void Holder<T>::reset(T *pt)
+{
+    objPt_.reset(pt);
+}
+
+/******************************************************************************
+ *                     Environment template implementation                    *
+ ******************************************************************************/
+// grids ///////////////////////////////////////////////////////////////////////
+#define HADRONS_DUMP_GRID(...)\
+LOG(Debug) << "New grid " << (__VA_ARGS__) << std::endl;\
+LOG(Debug) << " - cb  : " << (__VA_ARGS__)->_isCheckerBoarded << std::endl;\
+LOG(Debug) << " - fdim: " << (__VA_ARGS__)->_fdimensions << std::endl;\
+LOG(Debug) << " - gdim: " << (__VA_ARGS__)->_gdimensions << std::endl;\
+LOG(Debug) << " - ldim: " << (__VA_ARGS__)->_ldimensions << std::endl;\
+LOG(Debug) << " - rdim: " << (__VA_ARGS__)->_rdimensions << std::endl;
+
+template <typename VType>
+void Environment::createGrid(const unsigned int Ls)
+{
+    const size_t      hash  = typeHash<VType>();
+    const FineGridKey key4d = {hash, 1}, key5d = {hash, Ls};
+
+    // grids are keyed by (vector type hash, Ls) and only built when missing
+    if (grid4d_.find(key4d) == grid4d_.end()) // 4D grid and its red-black companion
+    {
+        auto &full4d = grid4d_[key4d];
+        full4d.reset(SpaceTimeGrid::makeFourDimGrid(
+            getDim(), GridDefaultSimd(getNd(), VType::Nsimd()), GridDefaultMpi()));
+        HADRONS_DUMP_GRID(full4d.get());
+        gridRb4d_[key4d].reset(SpaceTimeGrid::makeFourDimRedBlackGrid(full4d.get()));
+        HADRONS_DUMP_GRID(gridRb4d_[key4d].get());
+    }
+    if (grid5d_.find(key5d) == grid5d_.end()) // 5D grids for this Ls, built from the 4D grid
+    {
+        auto *base   = grid4d_[key4d].get();
+        auto &full5d = grid5d_[key5d];
+        full5d.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, base));
+        HADRONS_DUMP_GRID(full5d.get());
+        gridRb5d_[key5d].reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, base));
+        HADRONS_DUMP_GRID(gridRb5d_[key5d].get());
+    }
+}
+
+template <typename VType>
+void Environment::createCoarseGrid(const std::vector<int> &blockSize,
+                                   const unsigned int Ls)
+{
+    const unsigned int nd      = getNd();
+    std::vector<int>   fineDim = getDim(), coarseDim(nd);
+    unsigned int       cLs     = Ls; // coarse Ls; only reduced when blockSize has an entry beyond nd
+    auto               key4d = blockSize, key5d = blockSize;
+    size_t             hash  = typeHash<VType>();
+    // build (if missing) the coarse 4D and 5D grids obtained by dividing each fine dimension by its block size
+    createGrid(Ls); // NOTE(review): uses the default VType (vComplex), not this function's VType - confirm intended
+    if (blockSize.size() < nd)
+    {
+        HADRONS_ERROR(Size, "block size has " + std::to_string(blockSize.size())
+                      + " dimension(s), expected at least " + std::to_string(nd));
+    }
+    for (unsigned int d = 0; d < nd; d++)
+    {
+        if ((blockSize[d] <= 0) || (fineDim[d] % blockSize[d] != 0)) // also guards the division below
+        {
+            HADRONS_ERROR(Size, "Fine dimension " + std::to_string(d) 
+                         + " (" + std::to_string(fineDim[d]) 
+                         + ") not divisible by block size ("
+                         + std::to_string(blockSize[d]) + ")"); 
+        }
+        coarseDim[d] = fineDim[d]/blockSize[d];
+    }
+    if (blockSize.size() > nd)
+    {
+        if ((blockSize[nd] <= 0) || (Ls % blockSize[nd] != 0))
+        {
+            HADRONS_ERROR(Size, "Fine Ls (" + std::to_string(Ls) 
+                         + ") not divisible by block size ("
+                         + std::to_string(blockSize[nd]) + ")");
+        }
+        cLs = Ls/blockSize[nd];
+    }
+    key4d.resize(nd);
+    key5d.push_back(Ls);
+    // 4D key: the first nd block sizes; 5D key: every block size followed by the fine Ls
+    CoarseGridKey hkey4d = {hash, key4d}, hkey5d = {hash, key5d};
+
+    if (gridCoarse4d_.find(hkey4d) == gridCoarse4d_.end())
+    {
+        gridCoarse4d_[hkey4d].reset(
+            SpaceTimeGrid::makeFourDimGrid(coarseDim, 
+                GridDefaultSimd(nd, VType::Nsimd()), GridDefaultMpi()));
+        HADRONS_DUMP_GRID(gridCoarse4d_[hkey4d].get());
+    }
+    if (gridCoarse5d_.find(hkey5d) == gridCoarse5d_.end())
+    {
+        gridCoarse5d_[hkey5d].reset(
+            SpaceTimeGrid::makeFiveDimGrid(cLs, gridCoarse4d_[hkey4d].get()));
+        HADRONS_DUMP_GRID(gridCoarse5d_[hkey5d].get());
+    }
+}
+
+#undef HADRONS_DUMP_GRID
+
+// Return the 4D grid stored in grid4d_ under {typeHash<VType>(), 1}; when no
+// such entry exists yet, createGrid<VType>(1) is called first.
+template <typename VType>
+GridCartesian * Environment::getGrid(void)
+{
+    FineGridKey key = {typeHash<VType>(), 1};
+
+    if (grid4d_.find(key) == grid4d_.end())
+    {
+        createGrid<VType>(1);
+    }
+
+    return grid4d_.at(key).get();
+}
+
+// Return the 4D red-black grid stored in gridRb4d_ under
+// {typeHash<VType>(), 1}; when no such entry exists yet,
+// createGrid<VType>(1) is called first.
+template <typename VType>
+GridRedBlackCartesian * Environment::getRbGrid(void)
+{
+    FineGridKey key = {typeHash<VType>(), 1};
+
+    if (gridRb4d_.find(key) == gridRb4d_.end())
+    {
+        createGrid<VType>(1);
+    }
+
+    return gridRb4d_.at(key).get();
+}
+
+// Return the 4D coarse grid for `blockSize`. The lookup key is
+// {typeHash<VType>(), blockSize resized to getNd() entries}; when the key is
+// missing, createCoarseGrid<VType>(blockSize, 1) is called first.
+template <typename VType>
+GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize)
+{
+    std::vector<int> s(blockSize);
+
+    s.resize(getNd());
+
+    CoarseGridKey key = {typeHash<VType>(), s};
+
+    if (gridCoarse4d_.find(key) == gridCoarse4d_.end())
+    {
+        createCoarseGrid<VType>(blockSize, 1);
+    }
+
+    return gridCoarse4d_.at(key).get();
+}
+
+// Return the 5D grid stored in grid5d_ under {typeHash<VType>(), Ls}; when no
+// such entry exists yet, createGrid<VType>(Ls) is called first.
+template <typename VType>
+GridCartesian * Environment::getGrid(const unsigned int Ls)
+{
+    FineGridKey key = {typeHash<VType>(), Ls};
+
+    if (grid5d_.find(key) == grid5d_.end())
+    {
+        createGrid<VType>(Ls);
+    }
+
+    return grid5d_.at(key).get();
+}
+
+// Return the 5D red-black grid stored in gridRb5d_ under
+// {typeHash<VType>(), Ls}; when no such entry exists yet,
+// createGrid<VType>(Ls) is called first.
+template <typename VType>
+GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls)
+{
+    FineGridKey key = {typeHash<VType>(), Ls};
+
+    if (gridRb5d_.find(key) == gridRb5d_.end())
+    {
+        createGrid<VType>(Ls);
+    }
+
+    return gridRb5d_.at(key).get();
+}
+
+// Return the 5D coarse grid for `blockSize` and fine fifth-dimension extent
+// `Ls`. The lookup key is {typeHash<VType>(), blockSize followed by Ls}; when
+// the key is missing, createCoarseGrid<VType>(blockSize, Ls) is called first.
+template <typename VType>
+GridCartesian * Environment::getCoarseGrid(const std::vector<int> &blockSize,
+                                           const unsigned int Ls)
+{
+    std::vector<int> s(blockSize);
+
+    s.push_back(Ls);
+
+    CoarseGridKey key = {typeHash<VType>(), s};
+
+    if (gridCoarse5d_.find(key) == gridCoarse5d_.end())
+    {
+        createCoarseGrid<VType>(blockSize, Ls);
+    }
+
+    return gridCoarse5d_.at(key).get();
+}
+
+
+// general memory management ///////////////////////////////////////////////////
+// Construct an object of type T from `args` and store it, wrapped in a
+// Holder<B>, in the slot registered under `name` (the slot is added when the
+// name is unknown).
+//   name    : object name in the environment.
+//   storage : storage class recorded for the slot.
+//   Ls      : fifth-dimension extent recorded for the slot.
+//   args    : forwarded to T's constructor.
+// The object is (re)built when the slot is empty or objects are not
+// protected. Otherwise the call is a silent no-op only when the existing slot
+// is a cache with matching storage, name and types; any mismatch throws
+// Exceptions::ObjectDefinition.
+template <typename B, typename T, typename ... Ts>
+void Environment::createDerivedObject(const std::string name,
+                                      const Environment::Storage storage,
+                                      const unsigned int Ls,
+                                      Ts && ... args)
+{
+    if (!hasObject(name))
+    {
+        addObject(name);
+    }
+    
+    unsigned int address = getObjectAddress(name);
+    
+    if (!object_[address].data or !objectsProtected())
+    {
+        MemoryStats memStats;
+    
+        // install local statistics only when no profiler is already active
+        if (!MemoryProfiler::stats)
+        {
+            MemoryProfiler::stats = &memStats;
+        }
+        try
+        {
+            size_t initMem               = MemoryProfiler::stats->currentlyAllocated;
+            object_[address].storage     = storage;
+            object_[address].Ls          = Ls;
+            object_[address].data.reset(new Holder<B>(new T(std::forward<Ts>(args)...)));
+            object_[address].size        = MemoryProfiler::stats->maxAllocated - initMem;
+            object_[address].type        = typeIdPt<B>();
+            object_[address].derivedType = typeIdPt<T>();
+        }
+        catch (...)
+        {
+            // memStats is about to go out of scope: never leave the global
+            // profiler pointer referring to it when construction fails
+            if (MemoryProfiler::stats == &memStats)
+            {
+                MemoryProfiler::stats = nullptr;
+            }
+            throw;
+        }
+        if (MemoryProfiler::stats == &memStats)
+        {
+            MemoryProfiler::stats = nullptr;
+        }
+    }
+    // object already exists, no error if it is a cache, error otherwise
+    else if ((object_[address].storage               != Storage::cache) or 
+             (object_[address].storage               != storage)        or
+             (object_[address].name                  != name)           or
+             (typeHash(object_[address].type)        != typeHash<B>())  or
+             (typeHash(object_[address].derivedType) != typeHash<T>()))
+    {
+        HADRONS_ERROR_REF(ObjectDefinition, "object '" + name + "' already allocated", address);
+    }
+}
+
+// Create an object whose stored (base) type and concrete type are both T:
+// forwards every argument to createDerivedObject<T, T>.
+template <typename T, typename ... Ts>
+void Environment::createObject(const std::string name, 
+                               const Environment::Storage storage,
+                               const unsigned int Ls,
+                               Ts && ... args)
+{
+    createDerivedObject<T, T>(name, storage, Ls, std::forward<Ts>(args)...);
+}
+
+// Return a pointer of type T* to the object stored at `address`, which must
+// be held as a Holder<B>.
+// Throws Exceptions::ObjectDefinition when the address is unknown or the slot
+// is empty, and Exceptions::ObjectType when the slot does not hold a
+// Holder<B> or (for T different from B) the held object cannot be
+// dynamic_cast to T.
+template <typename B, typename T>
+T * Environment::getDerivedObject(const unsigned int address) const
+{
+    if (!hasObject(address))
+    {
+        HADRONS_ERROR_REF(ObjectDefinition, "no object with address " + 
+                          std::to_string(address), address);
+    }
+    if (!hasCreatedObject(address))
+    {
+        HADRONS_ERROR_REF(ObjectDefinition, "object with address " + 
+                          std::to_string(address) + " is empty", address);
+    }
+
+    auto h = dynamic_cast<Holder<B> *>(object_[address].data.get());
+
+    if (h == nullptr)
+    {
+        HADRONS_ERROR_REF(ObjectType, "object with address " + 
+                    std::to_string(address) +
+                    " does not have type '" + typeName(&typeid(B)) +
+                    "' (has type '" + getObjectType(address) + "')", address);
+    }
+    // compare the type_info objects, not their addresses: distinct type_info
+    // instances may describe the same type
+    if (typeid(T) == typeid(B))
+    {
+        return dynamic_cast<T *>(h->getPt());
+    }
+
+    auto hder = dynamic_cast<T *>(h->getPt());
+
+    if (hder == nullptr)
+    {
+        HADRONS_ERROR_REF(ObjectType, "object with address " +
+            std::to_string(address) +
+            " cannot be casted to '" + typeName(&typeid(T)) +
+            "' (has type '" + typeName(&typeid(h->get())) + "')", address);
+    }
+
+    return hder;
+}
+
+// Name-based overload: resolves `name` with getObjectAddress and delegates to
+// the address-based getDerivedObject<B, T>.
+template <typename B, typename T>
+T * Environment::getDerivedObject(const std::string name) const
+{
+    return getDerivedObject<B, T>(getObjectAddress(name));
+}
+
+// Return the object at `address` as a T*, with T used both as the stored and
+// the requested type (getDerivedObject<T, T>).
+template <typename T>
+T * Environment::getObject(const unsigned int address) const
+{
+    return getDerivedObject<T, T>(address);
+}
+
+// Name-based overload: resolves `name` with getObjectAddress and delegates to
+// the address-based getObject<T>.
+template <typename T>
+T * Environment::getObject(const std::string name) const
+{
+    return getObject<T>(getObjectAddress(name));
+}
+
+// Tell whether the data stored at `address` can be dynamic_cast to
+// Holder<T>. Throws Exceptions::ObjectDefinition when the address is unknown.
+template <typename T>
+bool Environment::isObjectOfType(const unsigned int address) const
+{
+    if (!hasObject(address))
+    {
+        HADRONS_ERROR_REF(ObjectDefinition, "no object with address " 
+                          + std::to_string(address), address);
+    }
+
+    return (dynamic_cast<Holder<T> *>(object_[address].data.get()) != nullptr);
+}
+
+// Name-based overload: resolves `name` with getObjectAddress and delegates to
+// the address-based isObjectOfType<T>.
+template <typename T>
+bool Environment::isObjectOfType(const std::string name) const
+{
+    return isObjectOfType<T>(getObjectAddress(name));
+}
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Environment_hpp_
diff --git a/Hadrons/Exceptions.cc b/Hadrons/Exceptions.cc
new file mode 100644
index 00000000..c9800610
--- /dev/null
+++ b/Hadrons/Exceptions.cc
@@ -0,0 +1,102 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Exceptions.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Exceptions.hpp>
+#include <Hadrons/VirtualMachine.hpp>
+#include <Hadrons/Module.hpp>
+
+#ifndef ERR_SUFF
+#define ERR_SUFF " (" + loc + ")"
+#endif
+
+#define CTOR_EXC(name, init) \
+name::name(std::string msg, std::string loc)\
+:init\
+{}
+
+#define CTOR_EXC_REF(name, init) \
+name::name(std::string msg, std::string loc, const unsigned int address)\
+:init\
+{}
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace Exceptions;
+
+// backtrace cache
+std::vector<std::string> Grid::Hadrons::Exceptions::backtraceStr;
+
+// logic errors
+CTOR_EXC(Logic, logic_error(msg + ERR_SUFF))
+CTOR_EXC(Definition, Logic("definition error: " + msg, loc))
+CTOR_EXC(Implementation, Logic("implementation error: " + msg, loc))
+CTOR_EXC(Range, Logic("range error: " + msg, loc))
+CTOR_EXC(Size, Logic("size error: " + msg, loc))
+
+// runtime errors
+CTOR_EXC(Runtime, runtime_error(msg + ERR_SUFF))
+CTOR_EXC(Argument, Runtime("argument error: " + msg, loc))
+CTOR_EXC(Io, Runtime("IO error: " + msg, loc))
+CTOR_EXC(Memory, Runtime("memory error: " + msg, loc))
+CTOR_EXC(Parsing, Runtime("parsing error: " + msg, loc))
+CTOR_EXC(Program, Runtime("program error: " + msg, loc))
+CTOR_EXC(System, Runtime("system error: " + msg, loc))
+
+// virtual machine errors
+CTOR_EXC_REF(ObjectDefinition, RuntimeRef("object definition error: " + msg, loc, address));
+CTOR_EXC_REF(ObjectType, RuntimeRef("object type error: " + msg, loc, address));
+
+// abort functions
+void Grid::Hadrons::Exceptions::abort(const std::exception& e)
+{
+    auto &vm = VirtualMachine::getInstance();
+    int  mod = vm.getCurrentModule();
+
+    LOG(Error) << "FATAL ERROR -- Exception " << typeName(&typeid(e)) 
+               << std::endl;
+    if (mod >= 0)
+    {
+        LOG(Error) << "During execution of module '"
+                    << vm.getModuleName(mod) << "' (address " << mod << ")"
+                    << std::endl;
+    }
+    LOG(Error) << e.what() << std::endl;
+    if (!backtraceStr.empty())
+    {
+        LOG(Error) << "-- BACKTRACE --------------" << std::endl;
+        for (auto &s: backtraceStr)
+        {
+            LOG(Error) << s << std::endl;
+        }
+        LOG(Error) << "---------------------------" << std::endl;
+    }
+    LOG(Error) << "Aborting program" << std::endl;
+    Grid_finalize();
+
+    exit(EXIT_FAILURE);
+}
diff --git a/Hadrons/Exceptions.hpp b/Hadrons/Exceptions.hpp
new file mode 100644
index 00000000..2a8b0f8c
--- /dev/null
+++ b/Hadrons/Exceptions.hpp
@@ -0,0 +1,129 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Exceptions.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_Exceptions_hpp_
+#define Hadrons_Exceptions_hpp_
+
+#include <stdexcept>
+#include <execinfo.h>
+#ifndef Hadrons_Global_hpp_
+#include <Hadrons/Global.hpp>
+#endif
+
+// HADRONS_SRC_LOC: "<function> at <file>:<line>" string for the expansion site
+#define HADRONS_SRC_LOC std::string(__FUNCTION__) + " at " \
+                        + std::string(__FILE__) + ":" + std::to_string(__LINE__)
+// maximum number of stack frames cached by HADRONS_CACHE_BACKTRACE
+#define HADRONS_BACKTRACE_MAX 128
+// HADRONS_CACHE_BACKTRACE: replace the content of
+// Grid::Hadrons::Exceptions::backtraceStr with the current call stack. It
+// expands to a single braced block. backtrace_symbols() may return NULL, in
+// which case the cache is simply left empty. The cache is always referenced
+// through its fully qualified name so the macro does not depend on the
+// namespace it is expanded in.
+#ifdef HAVE_EXECINFO_H
+#define HADRONS_CACHE_BACKTRACE \
+{\
+    void* _callstack[HADRONS_BACKTRACE_MAX];\
+    int _i, _frames = backtrace(_callstack, HADRONS_BACKTRACE_MAX);\
+    char** _strs = backtrace_symbols(_callstack, _frames);\
+    Grid::Hadrons::Exceptions::backtraceStr.clear();\
+    if (_strs != nullptr)\
+    {\
+        for (_i = 0; _i < _frames; ++_i)\
+        {\
+            Grid::Hadrons::Exceptions::backtraceStr.push_back(std::string(_strs[_i]));\
+        }\
+        free(_strs);\
+    }\
+}
+#else
+#define HADRONS_CACHE_BACKTRACE \
+{\
+    Grid::Hadrons::Exceptions::backtraceStr.clear();\
+    Grid::Hadrons::Exceptions::backtraceStr.push_back("<backtrace not supported>");\
+}
+#endif
+
+// HADRONS_ERROR / HADRONS_ERROR_REF: cache the backtrace, then throw
+// Exceptions::exc built from `msg` and the source location (plus an object
+// address for the _REF variant). Both expand to one braced block so that the
+// throw stays attached to the backtrace caching under an unbraced `if`.
+#define HADRONS_ERROR(exc, msg)\
+{\
+    HADRONS_CACHE_BACKTRACE \
+    throw(Exceptions::exc(msg, HADRONS_SRC_LOC));\
+}
+
+#define HADRONS_ERROR_REF(exc, msg, address)\
+{\
+    HADRONS_CACHE_BACKTRACE \
+    throw(Exceptions::exc(msg, HADRONS_SRC_LOC, address));\
+}
+
+#define DECL_EXC(name, base) \
+class name: public base\
+{\
+public:\
+    name(std::string msg, std::string loc);\
+}
+
+#define DECL_EXC_REF(name, base) \
+class name: public base\
+{\
+public:\
+    name(std::string msg, std::string loc, const unsigned int address);\
+}
+
+BEGIN_HADRONS_NAMESPACE
+
+namespace Exceptions
+{
+    // backtrace cache
+    extern std::vector<std::string> backtraceStr;
+
+    // logic errors
+    DECL_EXC(Logic, std::logic_error);
+    DECL_EXC(Definition, Logic);
+    DECL_EXC(Implementation, Logic);
+    DECL_EXC(Range, Logic);
+    DECL_EXC(Size, Logic);
+
+    // runtime errors
+    DECL_EXC(Runtime, std::runtime_error);
+    DECL_EXC(Argument, Runtime);
+    DECL_EXC(Io, Runtime);
+    DECL_EXC(Memory, Runtime);
+    DECL_EXC(Parsing, Runtime);
+    DECL_EXC(Program, Runtime);
+    DECL_EXC(System, Runtime);
+
+    // virtual machine errors
+    class RuntimeRef: public Runtime
+    {
+    public:
+        RuntimeRef(std::string msg, std::string loc, const unsigned int address)
+        : Runtime(msg, loc), address_(address)
+        {}
+        unsigned int getAddress(void) const
+        {
+            return address_;
+        }
+    private:
+        unsigned int address_;
+    };
+
+    DECL_EXC_REF(ObjectDefinition, RuntimeRef);
+    DECL_EXC_REF(ObjectType, RuntimeRef);
+
+    // abort functions
+    void abort(const std::exception& e);
+}
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Exceptions_hpp_
diff --git a/extras/Hadrons/Factory.hpp b/Hadrons/Factory.hpp
similarity index 94%
rename from extras/Hadrons/Factory.hpp
rename to Hadrons/Factory.hpp
index 65ce03ca..8a3c20ba 100644
--- a/extras/Hadrons/Factory.hpp
+++ b/Hadrons/Factory.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Factory.hpp
+Source file: Hadrons/Factory.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -30,7 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_Factory_hpp_
 #define Hadrons_Factory_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
+#include <Hadrons/Global.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -95,7 +94,7 @@ std::unique_ptr<T> Factory<T>::create(const std::string type,
     }
     catch (std::out_of_range &)
     {
-        HADRON_ERROR(Argument, "object of type '" + type + "' unknown");
+        HADRONS_ERROR(Argument, "object of type '" + type + "' unknown");
     }
     
     return func(name);
diff --git a/extras/Hadrons/GeneticScheduler.hpp b/Hadrons/GeneticScheduler.hpp
similarity index 94%
rename from extras/Hadrons/GeneticScheduler.hpp
rename to Hadrons/GeneticScheduler.hpp
index 76535a74..9025641b 100644
--- a/extras/Hadrons/GeneticScheduler.hpp
+++ b/Hadrons/GeneticScheduler.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/GeneticScheduler.hpp
+Source file: Hadrons/GeneticScheduler.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -30,8 +29,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_GeneticScheduler_hpp_
 #define Hadrons_GeneticScheduler_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Graph.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Graph.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -58,7 +57,9 @@ public:
     virtual ~GeneticScheduler(void) = default;
     // access
     const Gene & getMinSchedule(void);
-    int          getMinValue(void);
+    V            getMinValue(void);
+    // reset population
+    void initPopulation(void);
     // breed a new generation
     void nextGeneration(void);
     // heuristic benchmarks
@@ -77,8 +78,6 @@ public:
         return out;
     }
 private:
-    // evolution steps
-    void initPopulation(void);
     void doCrossover(void);
     void doMutation(void);
     // genetic operators
@@ -117,7 +116,7 @@ GeneticScheduler<V, T>::getMinSchedule(void)
 }
 
 template <typename V, typename T>
-int GeneticScheduler<V, T>::getMinValue(void)
+V GeneticScheduler<V, T>::getMinValue(void)
 {
     return population_.begin()->first;
 }
@@ -131,28 +130,28 @@ void GeneticScheduler<V, T>::nextGeneration(void)
     {
         initPopulation();
     }
-    LOG(Debug) << "Starting population:\n" << *this << std::endl;
+    //LOG(Debug) << "Starting population:\n" << *this << std::endl;
     
     // random mutations
     for (unsigned int i = 0; i < par_.popSize; ++i)
     {
         doMutation();
     }
-    LOG(Debug) << "After mutations:\n" << *this << std::endl;
+    //LOG(Debug) << "After mutations:\n" << *this << std::endl;
     
     // mating
     for (unsigned int i = 0; i < par_.popSize/2; ++i)
     {
         doCrossover();
     }
-    LOG(Debug) << "After mating:\n" << *this << std::endl;
+    //LOG(Debug) << "After mating:\n" << *this << std::endl;
     
     // grim reaper
     auto it = population_.begin();
     
     std::advance(it, par_.popSize);
     population_.erase(it, population_.end());
-    LOG(Debug) << "After grim reaper:\n" << *this << std::endl;
+    //LOG(Debug) << "After grim reaper:\n" << *this << std::endl;
 }
 
 // evolution steps /////////////////////////////////////////////////////////////
diff --git a/Hadrons/Global.cc b/Hadrons/Global.cc
new file mode 100644
index 00000000..d72dbbd6
--- /dev/null
+++ b/Hadrons/Global.cc
@@ -0,0 +1,214 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Global.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Global.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace Hadrons;
+
+HadronsLogger Hadrons::HadronsLogError(1,"Error");
+HadronsLogger Hadrons::HadronsLogWarning(1,"Warning");
+HadronsLogger Hadrons::HadronsLogMessage(1,"Message");
+HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative");
+HadronsLogger Hadrons::HadronsLogDebug(1,"Debug");
+HadronsLogger Hadrons::HadronsLogIRL(1,"IRL");
+
+// Configure the Grid and Hadrons log channels:
+//  - every Grid channel gets a top width equal to the length of "Hadrons" and
+//    a channel width of 8;
+//  - the Hadrons Error and Warning channels are always activated, the other
+//    Hadrons channels copy the active state of the matching Grid channel;
+//  - every Hadrons channel gets a channel width of 8.
+void Hadrons::initLogger(void)
+{
+    auto topWidth  = std::string("Hadrons").length();
+    int  chanWidth = 8;
+
+    // Grid channels
+    GridLogError.setTopWidth(topWidth);
+    GridLogError.setChanWidth(chanWidth);
+    GridLogWarning.setTopWidth(topWidth);
+    GridLogWarning.setChanWidth(chanWidth);
+    GridLogMessage.setTopWidth(topWidth);
+    GridLogMessage.setChanWidth(chanWidth);
+    GridLogIterative.setTopWidth(topWidth);
+    GridLogIterative.setChanWidth(chanWidth);
+    GridLogDebug.setTopWidth(topWidth);
+    GridLogDebug.setChanWidth(chanWidth);
+    GridLogIRL.setTopWidth(topWidth);
+    GridLogIRL.setChanWidth(chanWidth);
+
+    // Hadrons channels
+    HadronsLogError.Active(true);
+    HadronsLogError.setChanWidth(chanWidth);
+    HadronsLogWarning.Active(true);
+    HadronsLogWarning.setChanWidth(chanWidth);
+    HadronsLogMessage.Active(GridLogMessage.isActive());
+    HadronsLogMessage.setChanWidth(chanWidth);
+    HadronsLogIterative.Active(GridLogIterative.isActive());
+    HadronsLogIterative.setChanWidth(chanWidth);
+    HadronsLogDebug.Active(GridLogDebug.isActive());
+    HadronsLogDebug.setChanWidth(chanWidth);
+    HadronsLogIRL.Active(GridLogIRL.isActive());
+    HadronsLogIRL.setChanWidth(chanWidth);
+}
+
+// type utilities //////////////////////////////////////////////////////////////
+// Return info->hash_code(); `info` is dereferenced without a null check.
+size_t Hadrons::typeHash(const std::type_info *info)
+{
+    return info->hash_code();
+}
+
+constexpr unsigned int maxNameSize = 1024u;
+
+// Return the demangled name of the type described by `info`. When
+// abi::__cxa_demangle fails (non-zero status or null result) the raw
+// info->name() string is returned instead.
+std::string Hadrons::typeName(const std::type_info *info)
+{
+    int         status = 0;
+    char        *buf;
+    std::string name;
+    
+    buf = abi::__cxa_demangle(info->name(), nullptr, nullptr, &status);
+    if ((status == 0) and (buf != nullptr))
+    {
+        name = buf;
+    }
+    else
+    {
+        // constructing a std::string from a null pointer is undefined
+        name = info->name();
+    }
+    // free(nullptr) is a no-op, so this is safe on the failure path too
+    free(buf);
+    
+    return name;
+}
+
+// default writers/readers /////////////////////////////////////////////////////
+#ifdef HAVE_HDF5
+const std::string Hadrons::resultFileExt = "h5";
+#else
+const std::string Hadrons::resultFileExt = "xml";
+#endif
+
+// recursive mkdir /////////////////////////////////////////////////////////////
+// Recursively create directory `dirName` with mode 755.
+// Nothing is done (and 0 is returned) when `dirName` is empty or is already
+// accessible with read, write and execute permissions. Otherwise every
+// intermediate component is created in turn, ignoring failures, and the
+// result of the final ::mkdir call is returned (0 on success, non-zero with
+// errno set on failure).
+// The path is handled as a std::string, so it is never truncated to a fixed
+// buffer size.
+int Hadrons::mkdir(const std::string dirName)
+{
+    if (!dirName.empty() and access(dirName.c_str(), R_OK|W_OK|X_OK))
+    {
+        const mode_t mode755 = S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
+        std::string  path    = dirName;
+
+        // drop a single trailing separator
+        if (path.back() == '/')
+        {
+            path.pop_back();
+        }
+        // create each parent directory; start at 1 to skip a leading '/'
+        for (size_t i = 1; i < path.size(); ++i)
+        {
+            if (path[i] == '/')
+            {
+                // failures (e.g. the component already exists) are
+                // deliberately ignored, only the final mkdir is reported
+                ::mkdir(path.substr(0, i).c_str(), mode755);
+            }
+        }
+
+        return ::mkdir(path.c_str(), mode755);
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+// Return the part of `s` following its last '/', or `s` itself when it
+// contains no '/'.
+std::string Hadrons::basename(const std::string &s)
+{
+    const std::string::size_type lastSep = s.rfind('/');
+
+    return (lastSep == std::string::npos) ? s : s.substr(lastSep + 1);
+}
+
+// Return the part of `s` preceding its last '/', or an empty string when it
+// contains no '/'.
+std::string Hadrons::dirname(const std::string &s)
+{
+    const std::string::size_type lastSep = s.rfind('/');
+
+    if (lastSep == std::string::npos)
+    {
+        return "";
+    }
+
+    return s.substr(0, lastSep);
+}
+
+// Create the directory part of `filename` (as given by dirname) with the
+// recursive mkdir. When a grid `g` is provided, the directory is only created
+// if g->IsBoss() is true; with a null `g` it is always created.
+// Throws Exceptions::Io when mkdir reports a failure.
+void Hadrons::makeFileDir(const std::string filename, GridBase *g)
+{
+    if ((g != nullptr) and !g->IsBoss())
+    {
+        return;
+    }
+
+    std::string dir = dirname(filename);
+
+    if (mkdir(dir) != 0)
+    {
+        HADRONS_ERROR(Io, "cannot create directory '" + dir
+                      + "' ( " + std::strerror(errno) + ")");
+    }
+}
+
+// Print the entries of `timing` on the Message log channel, sorted by
+// decreasing duration, each with its share of `total` in percent.
+//   timing: timer name -> duration.
+//   total : reference duration for the percentages.
+// Entries are collected in a multimap so that timers with identical durations
+// are all reported instead of overwriting one another. The std::cout format
+// flags and precision are restored before returning.
+void Hadrons::printTimeProfile(const std::map<std::string, GridTime> &timing, 
+                               GridTime total)
+{
+    typedef decltype(total.count()) Count;
+
+    std::multimap<Count, std::string, std::greater<Count>> rtiming;
+    const double dtotal = static_cast<double>(total.count());
+    auto cf = std::cout.flags();
+    auto p  = std::cout.precision();
+    unsigned int width = 0;
+
+    for (auto &t: timing)
+    {
+        // column width is the longest timer name
+        width = std::max(width, static_cast<unsigned int>(t.first.length()));
+        rtiming.emplace(t.second.count(), t.first);
+    }
+    for (auto &rt: rtiming)
+    {
+        LOG(Message) << std::setw(width) << rt.second << ": " 
+                     << rt.first << " us (" << std::fixed 
+                     << std::setprecision(1) 
+                     << static_cast<double>(rt.first)/dtotal*100 << "%)"
+                     << std::endl;
+    }
+    std::cout.flags(cf);
+    std::cout.precision(p);
+}
diff --git a/Hadrons/Global.hpp b/Hadrons/Global.hpp
new file mode 100644
index 00000000..aaf03427
--- /dev/null
+++ b/Hadrons/Global.hpp
@@ -0,0 +1,270 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Global.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_Global_hpp_
+#define Hadrons_Global_hpp_
+
+#include <set>
+#include <stack>
+#include <regex>
+#include <Grid/Grid.h>
+#include <cxxabi.h>
+
+#ifndef SITE_SIZE_TYPE
+#define SITE_SIZE_TYPE size_t
+#endif
+
+#ifndef DEFAULT_ASCII_PREC
+#define DEFAULT_ASCII_PREC 16
+#endif
+
+/* the 'using Grid::operator<<;' statement prevents a very nasty compilation
+ * error with GCC 5 (clang & GCC 6 compile fine without it).
+ */
+
+#define BEGIN_HADRONS_NAMESPACE \
+namespace Grid {\
+using namespace QCD;\
+namespace Hadrons {\
+using Grid::operator<<;\
+using Grid::operator>>;
+#define END_HADRONS_NAMESPACE }}
+
+#define BEGIN_MODULE_NAMESPACE(name)\
+namespace name {\
+using Grid::operator<<;\
+using Grid::operator>>;
+
+#define END_MODULE_NAMESPACE }
+
+#define _HADRONS_IMPL(impl, sub) impl##sub
+#define HADRONS_IMPL(impl, sub)   _HADRONS_IMPL(impl, sub)
+
+#ifndef FIMPLBASE
+#define FIMPLBASE WilsonImpl
+#endif
+#define FIMPL  HADRONS_IMPL(FIMPLBASE, R)
+#define FIMPLF HADRONS_IMPL(FIMPLBASE, F)
+#define FIMPLD HADRONS_IMPL(FIMPLBASE, D)
+
+#ifndef ZFIMPLBASE
+#define ZFIMPLBASE ZWilsonImpl
+#endif
+#define ZFIMPL  HADRONS_IMPL(ZFIMPLBASE, R)
+#define ZFIMPLF HADRONS_IMPL(ZFIMPLBASE, F)
+#define ZFIMPLD HADRONS_IMPL(ZFIMPLBASE, D)
+
+#ifndef SIMPLBASE
+#define SIMPLBASE ScalarImplC
+#endif
+#define SIMPL  HADRONS_IMPL(SIMPLBASE, R)
+#define SIMPLF HADRONS_IMPL(SIMPLBASE, F)
+#define SIMPLD HADRONS_IMPL(SIMPLBASE, D)
+
+#ifndef GIMPLBASE
+#define GIMPLBASE PeriodicGimpl
+#endif
+#define GIMPL  HADRONS_IMPL(GIMPLBASE, R)
+#define GIMPLF HADRONS_IMPL(GIMPLBASE, F)
+#define GIMPLD HADRONS_IMPL(GIMPLBASE, D)
+
+BEGIN_HADRONS_NAMESPACE
+
+// type aliases
+#define BASIC_TYPE_ALIASES(Impl, suffix)\
+typedef typename Impl::Field                         ScalarField##suffix;\
+typedef typename Impl::PropagatorField               PropagatorField##suffix;\
+typedef typename Impl::SitePropagator::scalar_object SitePropagator##suffix;\
+typedef std::vector<SitePropagator##suffix>          SlicedPropagator##suffix;
+
+#define FERM_TYPE_ALIASES(FImpl, suffix)\
+BASIC_TYPE_ALIASES(FImpl, suffix);\
+typedef FermionOperator<FImpl>            FMat##suffix;\
+typedef typename FImpl::FermionField      FermionField##suffix;\
+typedef typename FImpl::GaugeField        GaugeField##suffix;\
+typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;\
+typedef typename FImpl::ComplexField      ComplexField##suffix;
+
+#define GAUGE_TYPE_ALIASES(GImpl, suffix)\
+typedef typename GImpl::GaugeField GaugeField##suffix;
+
+#define SOLVER_TYPE_ALIASES(FImpl, suffix)\
+typedef Solver<FImpl> Solver##suffix;
+
+#define SINK_TYPE_ALIASES(suffix)\
+typedef std::function<SlicedPropagator##suffix\
+                      (const PropagatorField##suffix &)> SinkFn##suffix;
+
+// logger
+// Logger whose base is constructed with the top name "Hadrons", the
+// GridLogColours palette and the "BLACK" colour; `on` and `nm` are passed
+// through to the Logger constructor.
+class HadronsLogger: public Logger
+{
+public:
+    HadronsLogger(int on, std::string nm)
+    : Logger("Hadrons", on, nm, GridLogColours, "BLACK")
+    {}
+};
+
+#define LOG(channel) std::cout << HadronsLog##channel
+#define HADRONS_DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
+
+extern HadronsLogger HadronsLogError;
+extern HadronsLogger HadronsLogWarning;
+extern HadronsLogger HadronsLogMessage;
+extern HadronsLogger HadronsLogIterative;
+extern HadronsLogger HadronsLogDebug;
+extern HadronsLogger HadronsLogIRL;
+
+void initLogger(void);
+
+// singleton pattern
+#define SINGLETON(name)\
+public:\
+    name(const name &e) = delete;\
+    void operator=(const name &e) = delete;\
+    static name & getInstance(void)\
+    {\
+        static name e;\
+        return e;\
+    }\
+private:\
+    name(void);
+
+#define SINGLETON_DEFCTOR(name)\
+public:\
+    name(const name &e) = delete;\
+    void operator=(const name &e) = delete;\
+    static name & getInstance(void)\
+    {\
+        static name e;\
+        return e;\
+    }\
+private:\
+    name(void) = default;
+
+// type utilities
+// pointer to the std::type_info returned by typeid(x)
+template <typename T>
+const std::type_info * typeIdPt(const T &x)
+{
+    return &typeid(x);
+}
+
+// pointer to the std::type_info of the type T
+template <typename T>
+const std::type_info * typeIdPt(void)
+{
+    return &typeid(T);
+}
+
+// hash of a type given its std::type_info (defined out of line)
+size_t typeHash(const std::type_info *info);
+
+// hash of the type reported by typeid(x)
+template <typename T>
+size_t typeHash(const T &x)
+{
+    return typeHash(typeIdPt(x));
+}
+
+// hash of the type T
+template <typename T>
+size_t typeHash(void)
+{
+    return typeHash(typeIdPt<T>());
+}
+
+// name of a type given its std::type_info (defined out of line)
+std::string typeName(const std::type_info *info);
+
+// name of the type reported by typeid(x)
+template <typename T>
+std::string typeName(const T &x)
+{
+    return typeName(typeIdPt(x));
+}
+
+// name of the type T
+template <typename T>
+std::string typeName(void)
+{
+    return typeName(typeIdPt<T>());
+}
+
+// default writers/readers
+extern const std::string resultFileExt;
+
+#ifdef HAVE_HDF5
+typedef Hdf5Reader ResultReader;
+typedef Hdf5Writer ResultWriter;
+#else
+typedef XmlReader ResultReader;
+typedef XmlWriter ResultWriter;
+#endif
+
+#define RESULT_FILE_NAME(name, traj) \
+name + "." + std::to_string(traj) + "." + resultFileExt
+
+// recursive mkdir
+#define MAX_PATH_LENGTH 512u
+int         mkdir(const std::string dirName);
+std::string basename(const std::string &s);
+std::string dirname(const std::string &s);
+void        makeFileDir(const std::string filename, GridBase *g = nullptr);
+
+// default Schur convention
+#ifndef HADRONS_DEFAULT_SCHUR 
+#define HADRONS_DEFAULT_SCHUR DiagTwo
+#endif
+#define _HADRONS_SCHUR_OP_(conv) Schur##conv##Operator
+#define HADRONS_SCHUR_OP(conv) _HADRONS_SCHUR_OP_(conv)
+#define HADRONS_DEFAULT_SCHUR_OP HADRONS_SCHUR_OP(HADRONS_DEFAULT_SCHUR)
+#define _HADRONS_SCHUR_SOLVE_(conv) SchurRedBlack##conv##Solve
+#define HADRONS_SCHUR_SOLVE(conv) _HADRONS_SCHUR_SOLVE_(conv)
+#define HADRONS_DEFAULT_SCHUR_SOLVE HADRONS_SCHUR_SOLVE(HADRONS_DEFAULT_SCHUR)
+#define _HADRONS_SCHUR_A2A_(conv) A2AVectorsSchur##conv
+#define HADRONS_SCHUR_A2A(conv) _HADRONS_SCHUR_A2A_(conv)
+#define HADRONS_DEFAULT_SCHUR_A2A HADRONS_SCHUR_A2A(HADRONS_DEFAULT_SCHUR)
+
+// stringify macro
+#define _HADRONS_STR(x) #x
+#define HADRONS_STR(x) _HADRONS_STR(x)
+
+// pretty print time profile
+void printTimeProfile(const std::map<std::string, GridTime> &timing, GridTime total);
+
+// token replacement utility
+// Replace the first occurrence of `mark + token + mark` in `str` by
+// std::to_string(x). `str` is left untouched when the pattern is not found;
+// later occurrences are not replaced.
+template <typename T>
+void tokenReplace(std::string &str, const std::string token,
+                  const T &x, const std::string mark = "@")
+{
+    const std::string            pattern = mark + token + mark;
+    const std::string::size_type where   = str.find(pattern);
+
+    if (where == std::string::npos)
+    {
+        return;
+    }
+    str.replace(where, pattern.size(), std::to_string(x));
+}
+
+END_HADRONS_NAMESPACE
+
+#include <Hadrons/Exceptions.hpp>
+
+#endif // Hadrons_Global_hpp_
diff --git a/extras/Hadrons/Graph.hpp b/Hadrons/Graph.hpp
similarity index 97%
rename from extras/Hadrons/Graph.hpp
rename to Hadrons/Graph.hpp
index a9c240fa..10e3bc60 100644
--- a/extras/Hadrons/Graph.hpp
+++ b/Hadrons/Graph.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Graph.hpp
+Source file: Hadrons/Graph.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -30,7 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_Graph_hpp_
 #define Hadrons_Graph_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
+#include <Hadrons/Global.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -185,7 +184,7 @@ void Graph<T>::removeVertex(const T &value)
     }
     else
     {
-        HADRON_ERROR(Range, "vertex does not exists");
+        HADRONS_ERROR(Range, "vertex does not exists");
     }
 
     // remove all edges containing the vertex
@@ -214,7 +213,7 @@ void Graph<T>::removeEdge(const Edge &e)
     }
     else
     {
-        HADRON_ERROR(Range, "edge does not exists");
+        HADRONS_ERROR(Range, "edge does not exists");
     }
 }
 
@@ -260,7 +259,7 @@ void Graph<T>::mark(const T &value, const bool doMark)
     }
     else
     {
-        HADRON_ERROR(Range, "vertex does not exists");
+        HADRONS_ERROR(Range, "vertex does not exists");
     }
 }
 
@@ -298,7 +297,7 @@ bool Graph<T>::isMarked(const T &value) const
     }
     else
     {
-        HADRON_ERROR(Range, "vertex does not exists");
+        HADRONS_ERROR(Range, "vertex does not exists");
         
         return false;
     }
@@ -544,7 +543,7 @@ std::vector<T> Graph<T>::topoSort(void)
     {
         if (tmpMarked.at(v))
         {
-            HADRON_ERROR(Range, "cannot topologically sort a cyclic graph");
+            HADRONS_ERROR(Range, "cannot topologically sort a cyclic graph");
         }
         if (!isMarked(v))
         {
@@ -603,7 +602,7 @@ std::vector<T> Graph<T>::topoSort(Gen &gen)
     {
         if (tmpMarked.at(v))
         {
-            HADRON_ERROR(Range, "cannot topologically sort a cyclic graph");
+            HADRONS_ERROR(Range, "cannot topologically sort a cyclic graph");
         }
         if (!isMarked(v))
         {
diff --git a/extras/Hadrons/Makefile.am b/Hadrons/Makefile.am
similarity index 58%
rename from extras/Hadrons/Makefile.am
rename to Hadrons/Makefile.am
index 3d07679a..b748085a 100644
--- a/extras/Hadrons/Makefile.am
+++ b/Hadrons/Makefile.am
@@ -1,20 +1,27 @@
+SUBDIRS = . Utilities
+
 lib_LIBRARIES = libHadrons.a
-bin_PROGRAMS  = HadronsXmlRun HadronsXmlSchedule
 
 include modules.inc
 
 libHadrons_a_SOURCES = \
-    $(modules_cc)      \
     Application.cc     \
     Environment.cc     \
 	Exceptions.cc      \
     Global.cc          \
     Module.cc		   \
-	VirtualMachine.cc
-libHadrons_adir = $(pkgincludedir)/Hadrons
+	TimerArray.cc      \
+	VirtualMachine.cc  \
+	$(modules_cc)
+	
+libHadrons_adir = $(includedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
-	$(modules_hpp)            \
+	A2AVectors.hpp            \
+	A2AMatrix.hpp             \
 	Application.hpp           \
+	DilutedNoise.hpp          \
+	DiskVector.hpp            \
+	EigenPack.hpp             \
 	Environment.hpp           \
 	Exceptions.hpp            \
 	Factory.hpp               \
@@ -24,10 +31,7 @@ nobase_libHadrons_a_HEADERS = \
 	Module.hpp                \
 	Modules.hpp               \
 	ModuleFactory.hpp         \
-	VirtualMachine.hpp
-
-HadronsXmlRun_SOURCES = HadronsXmlRun.cc
-HadronsXmlRun_LDADD   = libHadrons.a -lGrid
-
-HadronsXmlSchedule_SOURCES = HadronsXmlSchedule.cc
-HadronsXmlSchedule_LDADD   = libHadrons.a -lGrid
+	Solver.hpp                \
+	TimerArray.hpp            \
+	VirtualMachine.hpp        \
+	$(modules_hpp)
diff --git a/extras/Hadrons/Module.cc b/Hadrons/Module.cc
similarity index 68%
rename from extras/Hadrons/Module.cc
rename to Hadrons/Module.cc
index 93e9316f..c72f8acb 100644
--- a/extras/Hadrons/Module.cc
+++ b/Hadrons/Module.cc
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Module.cc
+Source file: Hadrons/Module.cc
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -27,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Hadrons/Module.hpp>
+#include <Hadrons/Module.hpp>
 
 using namespace Grid;
  
@@ -50,13 +49,47 @@ std::string ModuleBase::getName(void) const
 // get factory registration name if available
 std::string ModuleBase::getRegisteredName(void)
 {
-    HADRON_ERROR(Definition, "module '" + getName() + "' has no registered type"
+    HADRONS_ERROR(Definition, "module '" + getName() + "' has no registered type"
                  + " in the factory");
 }
 
 // execution ///////////////////////////////////////////////////////////////////
 void ModuleBase::operator()(void)
 {
+    resetTimers();
+    startTimer("_total");
+    startTimer("_setup");
     setup();
+    stopTimer("_setup");
+    startTimer("_execute");
     execute();
+    stopAllTimers();
+}
+
+std::string ModuleBase::makeSeedString(void)
+{
+    // seed layout: "[<run id>-]<module name>-<trajectory>"
+    const auto  &runId = vm().getRunId();
+    std::string seed   = runId.empty() ? std::string() : runId + "-";
+
+    seed += getName();
+    seed += "-";
+    seed += std::to_string(vm().getTrajectory());
+
+    return seed;
+}
+
+GridParallelRNG & ModuleBase::rng4d(void)
+{
+    // environment 4D RNG, reseeded here when this module's seed string changed
+    auto              &r   = *env().get4dRng();
+    const std::string seed = makeSeedString(); // built once, used twice below
+    if (seed != seed_)
+    {
+        seed_ = seed;
+        LOG(Message) << "Seeding 4D RNG " << &r << " with string '"
+                     << seed_ << "'" << std::endl;
+        r.SeedUniqueString(seed_);
+    }
+    return r;
+}
diff --git a/extras/Hadrons/Module.hpp b/Hadrons/Module.hpp
similarity index 69%
rename from extras/Hadrons/Module.hpp
rename to Hadrons/Module.hpp
index b71f779d..8d59d06c 100644
--- a/extras/Hadrons/Module.hpp
+++ b/Hadrons/Module.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Module.hpp
+Source file: Hadrons/Module.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -30,38 +29,14 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_Module_hpp_
 #define Hadrons_Module_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/VirtualMachine.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/TimerArray.hpp>
+#include <Hadrons/VirtualMachine.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
 // module registration macros
-#define MODULE_REGISTER(mod, base)\
-class mod: public base\
-{\
-public:\
-    typedef base Base;\
-    using Base::Base;\
-    virtual std::string getRegisteredName(void)\
-    {\
-        return std::string(#mod);\
-    }\
-};\
-class mod##ModuleRegistrar\
-{\
-public:\
-    mod##ModuleRegistrar(void)\
-    {\
-        ModuleFactory &modFac = ModuleFactory::getInstance();\
-        modFac.registerBuilder(#mod, [&](const std::string name)\
-                              {\
-                                  return std::unique_ptr<mod>(new mod(name));\
-                              });\
-    }\
-};\
-static mod##ModuleRegistrar mod##ModuleRegistrarInstance;
-
-#define MODULE_REGISTER_NS(mod, base, ns)\
+#define MODULE_REGISTER(mod, base, ns)\
 class mod: public base\
 {\
 public:\
@@ -86,12 +61,38 @@ public:\
 };\
 static ns##mod##ModuleRegistrar ns##mod##ModuleRegistrarInstance;
 
+#define MODULE_REGISTER_TMP(mod, base, ns)\
+extern template class base;\
+MODULE_REGISTER(mod, ARG(base), ns);
+
 #define ARG(...) __VA_ARGS__
-#define MACRO_REDIRECT(arg1, arg2, arg3, macro, ...) macro
+#define HADRONS_MACRO_REDIRECT_12(arg1, arg2, macro, ...) macro
+#define HADRONS_MACRO_REDIRECT_23(arg1, arg2, arg3, macro, ...) macro
+
+#define envGetGrid4(latticeType)\
+env().template getGrid<typename latticeType::vector_type>()
+
+#define envGetGrid5(latticeType, Ls)\
+env().template getGrid<typename latticeType::vector_type>(Ls)
+
+#define envGetGrid(...)\
+HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetGrid5, envGetGrid4)(__VA_ARGS__)
+
+#define envGetRbGrid4(latticeType)\
+env().template getRbGrid<typename latticeType::vector_type>()
+
+#define envGetRbGrid5(latticeType, Ls)\
+env().template getRbGrid<typename latticeType::vector_type>(Ls)
+
+#define envGetRbGrid(...)\
+HADRONS_MACRO_REDIRECT_12(__VA_ARGS__, envGetRbGrid5, envGetRbGrid4)(__VA_ARGS__)
 
 #define envGet(type, name)\
 *env().template getObject<type>(name)
 
+#define envGetDerived(base, type, name)\
+*env().template getDerivedObject<base, type>(name)
+
 #define envGetTmp(type, var)\
 type &var = *env().template getObject<type>(getName() + "_tmp_" + #var)
 
@@ -105,44 +106,54 @@ env().template createObject<type>(name, Environment::Storage::object, Ls, __VA_A
 env().template createDerivedObject<base, type>(name, Environment::Storage::object, Ls, __VA_ARGS__)
 
 #define envCreateLat4(type, name)\
-envCreate(type, name, 1, env().getGrid())
+envCreate(type, name, 1, envGetGrid(type))
 
 #define envCreateLat5(type, name, Ls)\
-envCreate(type, name, Ls, env().getGrid(Ls))
+envCreate(type, name, Ls, envGetGrid(type, Ls))
 
 #define envCreateLat(...)\
-MACRO_REDIRECT(__VA_ARGS__, envCreateLat5, envCreateLat4)(__VA_ARGS__)
+HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCreateLat5, envCreateLat4)(__VA_ARGS__)
 
 #define envCache(type, name, Ls, ...)\
 env().template createObject<type>(name, Environment::Storage::cache, Ls, __VA_ARGS__)
 
 #define envCacheLat4(type, name)\
-envCache(type, name, 1, env().getGrid())
+envCache(type, name, 1, envGetGrid(type))
 
 #define envCacheLat5(type, name, Ls)\
-envCache(type, name, Ls, env().getGrid(Ls))
+envCache(type, name, Ls, envGetGrid(type, Ls))
 
 #define envCacheLat(...)\
-MACRO_REDIRECT(__VA_ARGS__, envCacheLat5, envCacheLat4)(__VA_ARGS__)
+HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envCacheLat5, envCacheLat4)(__VA_ARGS__)
 
 #define envTmp(type, name, Ls, ...)\
 env().template createObject<type>(getName() + "_tmp_" + name,         \
                                   Environment::Storage::temporary, Ls, __VA_ARGS__)
 
 #define envTmpLat4(type, name)\
-envTmp(type, name, 1, env().getGrid())
+envTmp(type, name, 1, envGetGrid(type))
 
 #define envTmpLat5(type, name, Ls)\
-envTmp(type, name, Ls, env().getGrid(Ls))
+envTmp(type, name, Ls, envGetGrid(type, Ls))
 
 #define envTmpLat(...)\
-MACRO_REDIRECT(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)
+HADRONS_MACRO_REDIRECT_23(__VA_ARGS__, envTmpLat5, envTmpLat4)(__VA_ARGS__)
+
+#define saveResult(ioStem, name, result)\
+if (env().getGrid()->IsBoss() and !(ioStem).empty()) /* IsBoss() rank, non-empty stem */\
+{\
+    makeFileDir(ioStem, env().getGrid());\
+    { /* _writer is destroyed at the end of this inner scope */\
+        ResultWriter _writer(RESULT_FILE_NAME((ioStem), vm().getTrajectory()));\
+        write(_writer, name, result);\
+    }\
+}
 
 /******************************************************************************
  *                            Module class                                    *
  ******************************************************************************/
 // base class
-class ModuleBase
+class ModuleBase: public TimerArray
 {
 public:
     // constructor
@@ -163,6 +174,8 @@ public:
     // parse parameters
     virtual void parseParameters(XmlReader &reader, const std::string name) = 0;
     virtual void saveParameters(XmlWriter &writer, const std::string name) = 0;
+    // parameter string
+    virtual std::string parString(void) const = 0;
     // setup
     virtual void setup(void) {};
     virtual void execute(void) = 0;
@@ -173,8 +186,13 @@ protected:
     DEFINE_ENV_ALIAS;
     // virtual machine shortcut
     DEFINE_VM_ALIAS;
+    // RNG seeded from module string
+    GridParallelRNG &rng4d(void);
 private:
-    std::string name_;
+    std::string makeSeedString(void);
+private:
+    std::string                          name_, currentTimer_, seed_;
+    std::map<std::string, GridStopWatch> timer_; 
 };
 
 // derived class, templating the parameter class
@@ -191,9 +209,11 @@ public:
     // parse parameters
     virtual void parseParameters(XmlReader &reader, const std::string name);
     virtual void saveParameters(XmlWriter &writer, const std::string name);
+    // parameter string
+    virtual std::string parString(void) const;
     // parameter access
-    const P & par(void) const;
-    void      setPar(const P &par);
+    const P &   par(void) const;
+    void        setPar(const P &par);
 private:
     P par_;
 };
@@ -216,6 +236,8 @@ public:
         push(writer, "options");
         pop(writer);
     };
+    // parameter string (empty)
+    virtual std::string parString(void) const {return "";};
 };
 
 /******************************************************************************
@@ -238,6 +260,16 @@ void Module<P>::saveParameters(XmlWriter &writer, const std::string name)
     write(writer, name, par_);
 }
 
+template <typename P>
+std::string Module<P>::parString(void) const
+{
+    // write par_ under its serialisable class name, return the writer's text
+    XmlWriter xml("", "");
+
+    write(xml, par_.SerialisableClassName(), par_);
+    return xml.string();
+}
+
 template <typename P>
 const P & Module<P>::par(void) const
 {
diff --git a/extras/Hadrons/ModuleFactory.hpp b/Hadrons/ModuleFactory.hpp
similarity index 89%
rename from extras/Hadrons/ModuleFactory.hpp
rename to Hadrons/ModuleFactory.hpp
index 48ab305c..7dcbac75 100644
--- a/extras/Hadrons/ModuleFactory.hpp
+++ b/Hadrons/ModuleFactory.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/ModuleFactory.hpp
+Source file: Hadrons/ModuleFactory.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -30,9 +29,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_ModuleFactory_hpp_
 #define Hadrons_ModuleFactory_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Factory.hpp>
-#include <Grid/Hadrons/Module.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Factory.hpp>
+#include <Hadrons/Module.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
diff --git a/Hadrons/Modules.hpp b/Hadrons/Modules.hpp
new file mode 100644
index 00000000..787fecea
--- /dev/null
+++ b/Hadrons/Modules.hpp
@@ -0,0 +1,74 @@
+#include <Hadrons/Modules/MContraction/Baryon.hpp>
+#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
+#include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
+#include <Hadrons/Modules/MContraction/Meson.hpp>
+#include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
+#include <Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
+#include <Hadrons/Modules/MContraction/DiscLoop.hpp>
+#include <Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
+#include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
+#include <Hadrons/Modules/MContraction/WardIdentity.hpp>
+#include <Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
+#include <Hadrons/Modules/MFermion/FreeProp.hpp>
+#include <Hadrons/Modules/MFermion/GaugeProp.hpp>
+#include <Hadrons/Modules/MSource/SeqGamma.hpp>
+#include <Hadrons/Modules/MSource/Point.hpp>
+#include <Hadrons/Modules/MSource/Wall.hpp>
+#include <Hadrons/Modules/MSource/Z2.hpp>
+#include <Hadrons/Modules/MSource/SeqConserved.hpp>
+#include <Hadrons/Modules/MSource/Momentum.hpp>
+#include <Hadrons/Modules/MSink/Smear.hpp>
+#include <Hadrons/Modules/MSink/Point.hpp>
+#include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
+#include <Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
+#include <Hadrons/Modules/MSolver/Guesser.hpp>
+#include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
+#include <Hadrons/Modules/MSolver/A2AVectors.hpp>
+#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
+#include <Hadrons/Modules/MGauge/UnitEm.hpp>
+#include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
+#include <Hadrons/Modules/MGauge/Unit.hpp>
+#include <Hadrons/Modules/MGauge/Electrify.hpp>
+#include <Hadrons/Modules/MGauge/Random.hpp>
+#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
+#include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
+#include <Hadrons/Modules/MGauge/StochEm.hpp>
+#include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
+#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
+#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
+#include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
+#include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
+#include <Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
+#include <Hadrons/Modules/MLoop/NoiseLoop.hpp>
+#include <Hadrons/Modules/MScalar/FreeProp.hpp>
+#include <Hadrons/Modules/MScalar/VPCounterTerms.hpp>
+#include <Hadrons/Modules/MScalar/ScalarVP.hpp>
+#include <Hadrons/Modules/MScalar/Scalar.hpp>
+#include <Hadrons/Modules/MScalar/ChargedProp.hpp>
+#include <Hadrons/Modules/MNPR/Bilinear.hpp>
+#include <Hadrons/Modules/MNPR/Amputate.hpp>
+#include <Hadrons/Modules/MNPR/FourQuark.hpp>
+#include <Hadrons/Modules/MAction/DWF.hpp>
+#include <Hadrons/Modules/MAction/MobiusDWF.hpp>
+#include <Hadrons/Modules/MAction/Wilson.hpp>
+#include <Hadrons/Modules/MAction/WilsonClover.hpp>
+#include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
+#include <Hadrons/Modules/MAction/ScaledDWF.hpp>
+#include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
+#include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
+#include <Hadrons/Modules/MScalarSUN/ShiftProbe.hpp>
+#include <Hadrons/Modules/MScalarSUN/Div.hpp>
+#include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
+#include <Hadrons/Modules/MScalarSUN/EMT.hpp>
+#include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
+#include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+#include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
+#include <Hadrons/Modules/MScalarSUN/Grad.hpp>
+#include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
+#include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
+#include <Hadrons/Modules/MIO/LoadNersc.hpp>
+#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
+#include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
+#include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
+#include <Hadrons/Modules/MIO/LoadBinary.hpp>
diff --git a/extras/Hadrons/make_module_list.sh b/Hadrons/Modules/MAction/DWF.cc
old mode 100755
new mode 100644
similarity index 55%
rename from extras/Hadrons/make_module_list.sh
rename to Hadrons/Modules/MAction/DWF.cc
index 8c6fa4da..38d25cb9
--- a/extras/Hadrons/make_module_list.sh
+++ b/Hadrons/Modules/MAction/DWF.cc
@@ -1,21 +1,10 @@
-#!/usr/bin/env bash
-
-echo 'modules_cc =\' > modules.inc
-find Modules -name '*.cc' -type f -print | sed 's/^/  /;$q;s/$/ \\/' >> modules.inc
-echo '' >> modules.inc
-echo 'modules_hpp =\' >> modules.inc
-find Modules -name '*.hpp' -type f -print | sed 's/^/  /;$q;s/$/ \\/' >> modules.inc
-echo '' >> modules.inc
-rm -f Modules.hpp
-echo "/*************************************************************************************
+/*************************************************************************************
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules.hpp
+Source file: Hadrons/Modules/MAction/DWF.cc
 
-Copyright (C) 2015
-Copyright (C) 2016
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -33,10 +22,16 @@ You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-See the full license in the file \"LICENSE\" in the top level distribution directory
+See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-" > Modules.hpp
-for f in `find Modules -name '*.hpp'`; do
-	echo "#include <Grid/Hadrons/${f}>" >> Modules.hpp
-done
+#include <Hadrons/Modules/MAction/DWF.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MAction;
+
+template class Grid::Hadrons::MAction::TDWF<FIMPL>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+template class Grid::Hadrons::MAction::TDWF<FIMPLF>;
+#endif
diff --git a/extras/Hadrons/Modules/MAction/DWF.hpp b/Hadrons/Modules/MAction/DWF.hpp
similarity index 79%
rename from extras/Hadrons/Modules/MAction/DWF.hpp
rename to Hadrons/Modules/MAction/DWF.hpp
index d99f1165..a7104b42 100644
--- a/extras/Hadrons/Modules/MAction/DWF.hpp
+++ b/Hadrons/Modules/MAction/DWF.hpp
@@ -2,12 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MAction/DWF.hpp
+Source file: Hadrons/Modules/MAction/DWF.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -30,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MAction_DWF_hpp_
 #define Hadrons_MAction_DWF_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -49,19 +49,20 @@ public:
                                     unsigned int, Ls,
                                     double      , mass,
                                     double      , M5,
-                                    std::string , boundary);
+                                    std::string , boundary,
+                                    std::string , twist);
 };
 
 template <typename FImpl>
 class TDWF: public Module<DWFPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TDWF(const std::string name);
     // destructor
-    virtual ~TDWF(void) = default;
+    virtual ~TDWF(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -72,7 +73,10 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(DWF, TDWF<FIMPL>, MAction);
+MODULE_REGISTER_TMP(DWF, TDWF<FIMPL>, MAction);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+MODULE_REGISTER_TMP(DWFF, TDWF<FIMPLF>, MAction);
+#endif
 
 /******************************************************************************
  *                        DWF template implementation                         *
@@ -111,14 +115,14 @@ void TDWF<FImpl>::setup(void)
     LOG(Message) << "Fermion boundary conditions: " << par().boundary
                  << std::endl;
                  
-    env().createGrid(par().Ls);
-    auto &U    = envGet(LatticeGaugeField, par().gauge);
-    auto &g4   = *env().getGrid();
-    auto &grb4 = *env().getRbGrid();
-    auto &g5   = *env().getGrid(par().Ls);
-    auto &grb5 = *env().getRbGrid(par().Ls);
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename DomainWallFermion<FImpl>::ImplParams implParams(boundary);
+    auto &U    = envGet(GaugeField, par().gauge);
+    auto &g4   = *envGetGrid(FermionField);
+    auto &grb4 = *envGetRbGrid(FermionField);
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
+    typename DomainWallFermion<FImpl>::ImplParams implParams;
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
     envCreateDerived(FMat, DomainWallFermion<FImpl>, getName(), par().Ls, U, g5,
                      grb5, g4, grb4, par().mass, par().M5, implParams);
 }
diff --git a/Hadrons/Modules/MAction/MobiusDWF.cc b/Hadrons/Modules/MAction/MobiusDWF.cc
new file mode 100644
index 00000000..879452d8
--- /dev/null
+++ b/Hadrons/Modules/MAction/MobiusDWF.cc
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MAction/MobiusDWF.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MAction/MobiusDWF.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MAction;
+
+template class Grid::Hadrons::MAction::TMobiusDWF<FIMPL>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+template class Grid::Hadrons::MAction::TMobiusDWF<FIMPLF>;
+#endif
diff --git a/Hadrons/Modules/MAction/MobiusDWF.hpp b/Hadrons/Modules/MAction/MobiusDWF.hpp
new file mode 100644
index 00000000..0ba9c4c3
--- /dev/null
+++ b/Hadrons/Modules/MAction/MobiusDWF.hpp
@@ -0,0 +1,140 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MAction/MobiusDWF.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MAction_MobiusDWF_hpp_
+#define Hadrons_MAction_MobiusDWF_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                      Mobius domain-wall fermion action                     *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MAction)
+
+class MobiusDWFPar: Serializable // parameters of the MobiusDWF module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(MobiusDWFPar,
+                                    std::string , gauge,    // name of the gauge field object
+                                    unsigned int, Ls,       // argument of the 5D grid lookups
+                                    double      , mass,     // forwarded to MobiusFermion
+                                    double      , M5,       // forwarded to MobiusFermion
+                                    double      , b,        // forwarded to MobiusFermion
+                                    double      , c,        // forwarded to MobiusFermion
+                                    std::string , boundary, // parsed with strToVec<Complex>
+                                    std::string , twist);   // parsed with strToVec<Real>
+};
+
+template <typename FImpl>
+class TMobiusDWF: public Module<MobiusDWFPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+public:
+    // constructor (name is forwarded to Module<MobiusDWFPar>)
+    TMobiusDWF(const std::string name);
+    // destructor
+    virtual ~TMobiusDWF(void) {};
+    // dependency relation: input is par().gauge, output is getName()
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the MobiusFermion<FImpl> object stored as getName()
+    virtual void setup(void);
+    // execution: empty, the operator is fully built in setup()
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(MobiusDWF, TMobiusDWF<FIMPL>, MAction);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+MODULE_REGISTER_TMP(MobiusDWFF, TMobiusDWF<FIMPLF>, MAction);
+#endif
+
+/******************************************************************************
+ *                      TMobiusDWF implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TMobiusDWF<FImpl>::TMobiusDWF(const std::string name)
+: Module<MobiusDWFPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TMobiusDWF<FImpl>::getInput(void)
+{
+    // single dependency: the object named by the 'gauge' parameter
+    std::vector<std::string> deps(1, par().gauge);
+    return deps;
+}
+
+template <typename FImpl>
+std::vector<std::string> TMobiusDWF<FImpl>::getOutput(void)
+{
+    // single product: the object registered under this module's name
+    std::vector<std::string> products(1, getName());
+    return products;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TMobiusDWF<FImpl>::setup(void)
+{
+    LOG(Message) << "Setting up Mobius domain wall fermion matrix with m= "
+                 << par().mass << ", M5= " << par().M5 << ", Ls= " << par().Ls
+                 << ", b= " << par().b << ", c= " << par().c
+                 << " using gauge field '" << par().gauge << "'" << std::endl;
+    LOG(Message) << "Fermion boundary conditions: " << par().boundary
+                 << std::endl;
+    LOG(Message) << "Twists: " << par().twist << std::endl;
+    // gauge field, then 4D/5D full and red-black grids looked up for FermionField
+    auto &U    = envGet(GaugeField, par().gauge);
+    auto &g4   = *envGetGrid(FermionField);
+    auto &grb4 = *envGetRbGrid(FermionField);
+    auto &g5   = *envGetGrid(FermionField, par().Ls);
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls);
+    typename MobiusFermion<FImpl>::ImplParams implParams;
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
+    envCreateDerived(FMat, MobiusFermion<FImpl>, getName(), par().Ls, U, g5,
+                     grb5, g4, grb4, par().mass, par().M5, par().b, par().c,
+                     implParams);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TMobiusDWF<FImpl>::execute(void)
+{}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MAction_MobiusDWF_hpp_
diff --git a/Hadrons/Modules/MAction/ScaledDWF.cc b/Hadrons/Modules/MAction/ScaledDWF.cc
new file mode 100644
index 00000000..7008bf5d
--- /dev/null
+++ b/Hadrons/Modules/MAction/ScaledDWF.cc
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MAction/ScaledDWF.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MAction/ScaledDWF.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MAction;
+
+template class Grid::Hadrons::MAction::TScaledDWF<FIMPL>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+template class Grid::Hadrons::MAction::TScaledDWF<FIMPLF>;
+#endif
diff --git a/Hadrons/Modules/MAction/ScaledDWF.hpp b/Hadrons/Modules/MAction/ScaledDWF.hpp
new file mode 100644
index 00000000..3b8066e6
--- /dev/null
+++ b/Hadrons/Modules/MAction/ScaledDWF.hpp
@@ -0,0 +1,139 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MAction/ScaledDWF.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MAction_ScaledDWF_hpp_
+#define Hadrons_MAction_ScaledDWF_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                      Scaled domain wall fermion                            *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MAction)
+
+class ScaledDWFPar: Serializable // parameter set of the module, used as Module<ScaledDWFPar>
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ScaledDWFPar,
+                                    std::string , gauge,    // name of the gauge field object fetched in setup()
+                                    unsigned int, Ls,       // Ls value used when requesting the g5/grb5 grids
+                                    double      , mass,     // passed on when creating the ScaledShamirFermion object
+                                    double      , M5,       // passed on when creating the ScaledShamirFermion object
+                                    double      , scale,    // passed on when creating the ScaledShamirFermion object
+                                    std::string , boundary, // parsed with strToVec<Complex> into boundary_phases
+                                    std::string , twist);   // parsed with strToVec<Real> into twist_n_2pi_L
+};
+
+template <typename FImpl>
+class TScaledDWF: public Module<ScaledDWFPar> // module whose setup() creates a ScaledShamirFermion<FImpl> object
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,); // macro defined elsewhere; presumably supplies the GaugeField/FermionField/FMat names used in setup() - confirm
+public:
+    // constructor: takes the module instance name
+    TScaledDWF(const std::string name);
+    // destructor: empty body
+    virtual ~TScaledDWF(void) {};
+    // dependency relation: names of the objects read and produced by this module
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: logs the parameters and creates the fermion matrix object
+    virtual void setup(void);
+    // execution: empty, all the work is done in setup()
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(ScaledDWF, TScaledDWF<FIMPL>, MAction); // macro defined elsewhere; presumably registers the FIMPL variant as "ScaledDWF" - confirm
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE // the FIMPLF variant "ScaledDWFF" exists only when this macro is defined
+MODULE_REGISTER_TMP(ScaledDWFF, TScaledDWF<FIMPLF>, MAction);
+#endif
+
+/******************************************************************************
+ *                      TScaledDWF implementation                             *
+ ******************************************************************************/
+// constructor: forwards the instance name to the Module<ScaledDWFPar> base ///
+template <typename FImpl>
+TScaledDWF<FImpl>::TScaledDWF(const std::string name)
+: Module<ScaledDWFPar>(name)
+{}
+
+// dependencies/products: the only input is the gauge field named in the parameters
+template <typename FImpl>
+std::vector<std::string> TScaledDWF<FImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().gauge};
+    
+    return in;
+}
+
+template <typename FImpl>
+std::vector<std::string> TScaledDWF<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()}; // single product, named after this module instance
+    
+    return out;
+}
+
+// setup: log the parameters, then create the ScaledShamirFermion<FImpl> object
+template <typename FImpl>
+void TScaledDWF<FImpl>::setup(void)
+{
+    LOG(Message) << "Setting up scaled domain wall fermion matrix with m= "
+                 << par().mass << ", M5= " << par().M5 << ", Ls= " << par().Ls 
+                 << ", scale= " << par().scale
+                 << " using gauge field '" << par().gauge << "'"
+                 << std::endl;
+    LOG(Message) << "Fermion boundary conditions: " << par().boundary
+                 << std::endl; // note: the twist parameter is not logged
+
+    auto &U    = envGet(GaugeField, par().gauge);       // gauge field looked up by name
+    auto &g4   = *envGetGrid(FermionField);             // grid requested without Ls (presumably the 4D grid - confirm)
+    auto &grb4 = *envGetRbGrid(FermionField);           // "Rb" counterpart of g4 (presumably red-black - confirm)
+    auto &g5   = *envGetGrid(FermionField, par().Ls);   // grid requested with Ls = par().Ls
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls); // "Rb" counterpart of g5
+    typename ScaledShamirFermion<FImpl>::ImplParams implParams; // default-constructed, then filled from the string parameters
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
+    envCreateDerived(FMat, ScaledShamirFermion<FImpl>, getName(), par().Ls, U, g5, // object is stored under this module's name
+                     grb5, g4, grb4, par().mass, par().M5, par().scale,           // presumably forwarded to the fermion constructor - confirm
+                     implParams);
+}
+
+// execution: empty body, the fermion matrix is created in setup() ////////////
+template <typename FImpl>
+void TScaledDWF<FImpl>::execute(void)
+{}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MAction_ScaledDWF_hpp_
diff --git a/Hadrons/Modules/MAction/Wilson.cc b/Hadrons/Modules/MAction/Wilson.cc
new file mode 100644
index 00000000..1e801ed6
--- /dev/null
+++ b/Hadrons/Modules/MAction/Wilson.cc
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MAction/Wilson.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MAction/Wilson.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MAction;
+
+template class Grid::Hadrons::MAction::TWilson<FIMPL>; // explicit instantiation for FIMPL, always compiled
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE // the FIMPLF variant is instantiated only when this macro is defined
+template class Grid::Hadrons::MAction::TWilson<FIMPLF>;
+#endif
diff --git a/extras/Hadrons/Modules/MAction/Wilson.hpp b/Hadrons/Modules/MAction/Wilson.hpp
similarity index 77%
rename from extras/Hadrons/Modules/MAction/Wilson.hpp
rename to Hadrons/Modules/MAction/Wilson.hpp
index 8ef755bb..b5e53837 100644
--- a/extras/Hadrons/Modules/MAction/Wilson.hpp
+++ b/Hadrons/Modules/MAction/Wilson.hpp
@@ -2,12 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MAction/Wilson.hpp
+Source file: Hadrons/Modules/MAction/Wilson.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -30,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MAction_Wilson_hpp_
 #define Hadrons_MAction_Wilson_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -47,19 +47,21 @@ public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonPar,
                                     std::string, gauge,
                                     double     , mass,
-                                    std::string, boundary);
+                                    std::string, boundary,
+                                    std::string, string,
+                                    std::string, twist);
 };
 
 template <typename FImpl>
 class TWilson: public Module<WilsonPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TWilson(const std::string name);
     // destructor
-    virtual ~TWilson(void) = default;
+    virtual ~TWilson(void) {};
     // dependencies/products
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -70,7 +72,10 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(Wilson, TWilson<FIMPL>, MAction);
+MODULE_REGISTER_TMP(Wilson, TWilson<FIMPL>, MAction);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+MODULE_REGISTER_TMP(WilsonF, TWilson<FIMPLF>, MAction);
+#endif
 
 /******************************************************************************
  *                     TWilson template implementation                        *
@@ -102,16 +107,17 @@ std::vector<std::string> TWilson<FImpl>::getOutput(void)
 template <typename FImpl>
 void TWilson<FImpl>::setup(void)
 {
-    LOG(Message) << "Setting up TWilson fermion matrix with m= " << par().mass
+    LOG(Message) << "Setting up Wilson fermion matrix with m= " << par().mass
                  << " using gauge field '" << par().gauge << "'" << std::endl;
     LOG(Message) << "Fermion boundary conditions: " << par().boundary
                  << std::endl;
                  
-    auto &U      = envGet(LatticeGaugeField, par().gauge);
-    auto &grid   = *env().getGrid();
-    auto &gridRb = *env().getRbGrid();
-    std::vector<Complex> boundary = strToVec<Complex>(par().boundary);
-    typename WilsonFermion<FImpl>::ImplParams implParams(boundary);
+    auto &U      = envGet(GaugeField, par().gauge);
+    auto &grid   = *envGetGrid(FermionField);
+    auto &gridRb = *envGetRbGrid(FermionField);
+    typename WilsonFermion<FImpl>::ImplParams implParams;
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
     envCreateDerived(FMat, WilsonFermion<FImpl>, getName(), 1, U, grid, gridRb,
                      par().mass, implParams);
 }
diff --git a/Hadrons/Modules/MAction/WilsonClover.cc b/Hadrons/Modules/MAction/WilsonClover.cc
new file mode 100644
index 00000000..eed1582c
--- /dev/null
+++ b/Hadrons/Modules/MAction/WilsonClover.cc
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MAction/WilsonClover.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MAction/WilsonClover.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MAction;
+
+template class Grid::Hadrons::MAction::TWilsonClover<FIMPL>; // explicit instantiation for FIMPL, always compiled
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE // the FIMPLF variant is instantiated only when this macro is defined
+template class Grid::Hadrons::MAction::TWilsonClover<FIMPLF>;
+#endif
diff --git a/Hadrons/Modules/MAction/WilsonClover.hpp b/Hadrons/Modules/MAction/WilsonClover.hpp
new file mode 100644
index 00000000..ad301380
--- /dev/null
+++ b/Hadrons/Modules/MAction/WilsonClover.hpp
@@ -0,0 +1,140 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MAction/WilsonClover.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: pretidav <david.preti@csic.es>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MAction_WilsonClover_hpp_
+#define Hadrons_MAction_WilsonClover_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         Wilson clover quark action                         *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MAction)
+
+class WilsonCloverPar: Serializable // parameter set of the module, used as Module<WilsonCloverPar>
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonCloverPar,
+                                    std::string, gauge, // name of the gauge field object fetched in setup()
+                                    double     , mass,  // passed on when creating the WilsonCloverFermion object
+				                    double     , csw_r, // clover coefficient, logged as "csw_r" and passed on
+				                    double     , csw_t, // clover coefficient, logged as "csw_t" and passed on
+				                    WilsonAnisotropyCoefficients ,clover_anisotropy, // passed on unchanged
+                                    std::string, boundary, // parsed with strToVec<Complex> into boundary_phases
+                                    std::string, twist     // parsed with strToVec<Real> into twist_n_2pi_L
+				    );
+};
+
+template <typename FImpl>
+class TWilsonClover: public Module<WilsonCloverPar> // module whose setup() creates a WilsonCloverFermion<FImpl> object
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,); // macro defined elsewhere; presumably supplies the GaugeField/FermionField/FMat names used in setup() - confirm
+public:
+    // constructor: takes the module instance name
+    TWilsonClover(const std::string name);
+    // destructor: empty body
+    virtual ~TWilsonClover(void) {};
+    // dependencies/products: names of the objects read and produced by this module
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: logs the parameters and creates the fermion matrix object
+    virtual void setup(void);
+    // execution: empty, all the work is done in setup()
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(WilsonClover, TWilsonClover<FIMPL>, MAction); // macro defined elsewhere; presumably registers the FIMPL variant as "WilsonClover" - confirm
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE // the FIMPLF variant "WilsonCloverF" exists only when this macro is defined
+MODULE_REGISTER_TMP(WilsonCloverF, TWilsonClover<FIMPLF>, MAction);
+#endif
+
+/******************************************************************************
+ *                    TWilsonClover template implementation                   *
+ ******************************************************************************/
+// constructor: forwards the instance name to the Module<WilsonCloverPar> base
+template <typename FImpl>
+TWilsonClover<FImpl>::TWilsonClover(const std::string name)
+: Module<WilsonCloverPar>(name)
+{}
+
+// dependencies/products: the only input is the gauge field named in the parameters
+template <typename FImpl>
+std::vector<std::string> TWilsonClover<FImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().gauge};
+
+    return in;
+}
+
+template <typename FImpl>
+std::vector<std::string> TWilsonClover<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()}; // single product, named after this module instance
+
+    return out;
+}
+
+// setup: log the parameters, then create the WilsonCloverFermion<FImpl> object
+template <typename FImpl>
+void TWilsonClover<FImpl>::setup(void)
+{
+    LOG(Message) << "Setting up Wilson clover fermion matrix with m= " << par().mass
+                 << " using gauge field '" << par().gauge << "'" << std::endl;
+    LOG(Message) << "Fermion boundary conditions: " << par().boundary 
+                 << std::endl; // note: the twist parameter is not logged
+    LOG(Message) << "Clover term csw_r: " << par().csw_r
+                 << " csw_t: " << par().csw_t
+                 << std::endl;
+    auto &U      = envGet(GaugeField, par().gauge); // gauge field looked up by name
+    auto &grid   = *envGetGrid(FermionField);       // grid requested without an Ls argument
+    auto &gridRb = *envGetRbGrid(FermionField);     // "Rb" counterpart of grid (presumably red-black - confirm)
+    typename WilsonCloverFermion<FImpl>::ImplParams implParams; // default-constructed, then filled from the string parameters
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
+    envCreateDerived(FMat, WilsonCloverFermion<FImpl>, getName(), 1, U, grid, // object is stored under this module's name
+                     gridRb, par().mass, par().csw_r, par().csw_t,           // presumably forwarded to the fermion constructor - confirm
+                     par().clover_anisotropy, implParams); 
+}
+
+// execution: empty body, the fermion matrix is created in setup() ////////////
+template <typename FImpl>
+void TWilsonClover<FImpl>::execute()
+{}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MAction_WilsonClover_hpp_
diff --git a/Hadrons/Modules/MAction/ZMobiusDWF.cc b/Hadrons/Modules/MAction/ZMobiusDWF.cc
new file mode 100644
index 00000000..609b76cc
--- /dev/null
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.cc
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MAction/ZMobiusDWF.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MAction;
+
+template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPL>; // explicit instantiation for ZFIMPL, always compiled
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE // the ZFIMPLF variant is instantiated only when this macro is defined
+template class Grid::Hadrons::MAction::TZMobiusDWF<ZFIMPLF>;
+#endif
diff --git a/Hadrons/Modules/MAction/ZMobiusDWF.hpp b/Hadrons/Modules/MAction/ZMobiusDWF.hpp
new file mode 100644
index 00000000..12ad82ea
--- /dev/null
+++ b/Hadrons/Modules/MAction/ZMobiusDWF.hpp
@@ -0,0 +1,148 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MAction/ZMobiusDWF.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MAction_ZMobiusDWF_hpp_
+#define Hadrons_MAction_ZMobiusDWF_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                      z-Mobius domain-wall fermion action                   *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MAction)
+
+class ZMobiusDWFPar: Serializable // parameter set of the module, used as Module<ZMobiusDWFPar>
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ZMobiusDWFPar,
+                                    std::string                      , gauge,    // name of the gauge field object fetched in setup()
+                                    unsigned int                     , Ls,       // Ls value used for createGrid() and the g5/grb5 grids
+                                    double                           , mass,     // passed on when creating the ZMobiusFermion object
+                                    double                           , M5,       // passed on when creating the ZMobiusFermion object
+                                    double                           , b,        // passed on when creating the ZMobiusFermion object
+                                    double                           , c,        // passed on when creating the ZMobiusFermion object
+                                    std::vector<std::complex<double>>, omega,    // logged element by element, then passed on as a copy
+                                    std::string                      , boundary, // parsed with strToVec<Complex> into boundary_phases
+                                    std::string                      , twist);   // parsed with strToVec<Real> into twist_n_2pi_L
+};
+
+template <typename FImpl>
+class TZMobiusDWF: public Module<ZMobiusDWFPar> // module whose setup() creates a ZMobiusFermion<FImpl> object
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,); // macro defined elsewhere; presumably supplies the GaugeField/FermionField/FMat names used in setup() - confirm
+public:
+    // constructor: takes the module instance name
+    TZMobiusDWF(const std::string name);
+    // destructor: empty body
+    virtual ~TZMobiusDWF(void) {};
+    // dependency relation: names of the objects read and produced by this module
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: logs the parameters and creates the fermion matrix object
+    virtual void setup(void);
+    // execution: empty, all the work is done in setup()
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(ZMobiusDWF, TZMobiusDWF<ZFIMPL>, MAction); // macro defined elsewhere; presumably registers the ZFIMPL variant as "ZMobiusDWF" - confirm
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE // the ZFIMPLF variant "ZMobiusDWFF" exists only when this macro is defined
+MODULE_REGISTER_TMP(ZMobiusDWFF, TZMobiusDWF<ZFIMPLF>, MAction);
+#endif
+
+/******************************************************************************
+ *                     TZMobiusDWF implementation                             *
+ ******************************************************************************/
+// constructor: forwards the instance name to the Module<ZMobiusDWFPar> base //
+template <typename FImpl>
+TZMobiusDWF<FImpl>::TZMobiusDWF(const std::string name)
+: Module<ZMobiusDWFPar>(name)
+{}
+
+// dependencies/products: the only input is the gauge field named in the parameters
+template <typename FImpl>
+std::vector<std::string> TZMobiusDWF<FImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().gauge};
+    
+    return in;
+}
+
+template <typename FImpl>
+std::vector<std::string> TZMobiusDWF<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()}; // single product, named after this module instance
+    
+    return out;
+}
+
+// setup: log the parameters, then create the ZMobiusFermion<FImpl> object /////
+template <typename FImpl>
+void TZMobiusDWF<FImpl>::setup(void)
+{
+    LOG(Message) << "Setting up z-Mobius domain wall fermion matrix with m= "
+                 << par().mass << ", M5= " << par().M5 << ", Ls= " << par().Ls 
+                 << ", b= " << par().b << ", c= " << par().c
+                 << " using gauge field '" << par().gauge << "'"
+                 << std::endl;
+    LOG(Message) << "Omegas: " << std::endl;
+    for (unsigned int i = 0; i < par().omega.size(); ++i) // one log line per omega entry
+    {
+        LOG(Message) << "  omega[" << i << "]= " << par().omega[i] << std::endl;
+    }
+    LOG(Message) << "Fermion boundary conditions: " << par().boundary
+                 << std::endl; // note: the twist parameter is not logged
+
+    env().createGrid(par().Ls);                         // explicit grid creation for this Ls, done before the grids are fetched
+    auto &U    = envGet(GaugeField, par().gauge);       // gauge field looked up by name
+    auto &g4   = *envGetGrid(FermionField);             // grid requested without Ls (presumably the 4D grid - confirm)
+    auto &grb4 = *envGetRbGrid(FermionField);           // "Rb" counterpart of g4 (presumably red-black - confirm)
+    auto &g5   = *envGetGrid(FermionField, par().Ls);   // grid requested with Ls = par().Ls
+    auto &grb5 = *envGetRbGrid(FermionField, par().Ls); // "Rb" counterpart of g5
+    auto omega = par().omega;                           // by-value local copy of the omega vector
+    typename ZMobiusFermion<FImpl>::ImplParams implParams; // default-constructed, then filled from the string parameters
+    implParams.boundary_phases = strToVec<Complex>(par().boundary);
+    implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
+    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls, U, g5, // object is stored under this module's name
+                     grb5, g4, grb4, par().mass, par().M5, omega,            // presumably forwarded to the fermion constructor - confirm
+                     par().b, par().c, implParams);
+}
+
+// execution: empty body, the fermion matrix is created in setup() ////////////
+template <typename FImpl>
+void TZMobiusDWF<FImpl>::execute(void)
+{}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MAction_ZMobiusDWF_hpp_
diff --git a/Hadrons/Modules/MContraction/A2AAslashField.cc b/Hadrons/Modules/MContraction/A2AAslashField.cc
new file mode 100644
index 00000000..65c49198
--- /dev/null
+++ b/Hadrons/Modules/MContraction/A2AAslashField.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/A2AAslashField.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TA2AAslashField<FIMPL, PhotonR>; // explicit instantiation of the only variant registered in the header
diff --git a/Hadrons/Modules/MContraction/A2AAslashField.hpp b/Hadrons/Modules/MContraction/A2AAslashField.hpp
new file mode 100644
index 00000000..8b99692b
--- /dev/null
+++ b/Hadrons/Modules/MContraction/A2AAslashField.hpp
@@ -0,0 +1,246 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/A2AAslashField.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MContraction_A2AAslashField_hpp_
+#define Hadrons_MContraction_A2AAslashField_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                               A2AAslashField                               *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MContraction)
+
+class A2AAslashFieldPar: Serializable // parameter set of the module, used as Module<A2AAslashFieldPar>
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldPar,
+                                    int, cacheBlock,                    // passed to the Computation temporary in setup()
+                                    int, block,                         // passed to the Computation temporary in setup()
+                                    std::string, left,                  // name of an input object, read as std::vector<FermionField>
+                                    std::string, right,                 // name of an input object, read as std::vector<FermionField>
+                                    std::string, output,                // presumably the output location - confirm in execute()
+                                    std::vector<std::string>, emField); // names of input objects; its size sets the B0/B1 vector sizes
+};
+
+class A2AAslashFieldMetadata: Serializable // metadata type plugged into the A2AMatrixBlockComputation typedef of the module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashFieldMetadata,
+                                    std::string, emFieldName); // single serialised string field
+};
+
+template <typename T, typename FImpl>
+class AslashFieldKernel: public A2AKernel<T, typename FImpl::FermionField> // kernel that forwards to A2Autils<FImpl>::AslashField
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    AslashFieldKernel(const std::vector<LatticeComplex> &emB0,
+                      const std::vector<LatticeComplex> &emB1,
+                      GridBase *grid)
+    : emB0_(emB0), emB1_(emB1), grid_(grid) // emB0/emB1 are bound by reference, not copied
+    {
+        vol_ = 1.; // vol_ becomes the product of the global grid dimensions
+        for (auto &d: grid_->GlobalDimensions())
+        {
+            vol_ *= d;
+        }
+    }
+
+    virtual ~AslashFieldKernel(void) = default;
+    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
+                            const FermionField *right,
+                            const unsigned int orthogDim, double &t)
+    {
+        A2Autils<FImpl>::AslashField(m, left, right, emB0_, emB1_, orthogDim, &t); // arguments passed straight through, t by address
+    }
+
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return 0.; // always zero, whatever the block sizes
+    }
+
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        return 0.; // always zero, whatever the block sizes
+    }
+private:
+    const std::vector<LatticeComplex> &emB0_, &emB1_; // reference members: the caller's vectors must outlive this kernel
+    GridBase                          *grid_;         // non-owning pointer, only read in the constructor
+    double                            vol_;           // set in the constructor; not read anywhere in this class
+};
+
+template <typename FImpl, typename PhotonImpl>
+class TA2AAslashField: public Module<A2AAslashFieldPar> // module taking two fermion-field vectors and a list of EM fields as inputs
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,); // macro defined elsewhere; presumably supplies FermionField/ComplexField used below - confirm
+    typedef typename PhotonImpl::GaugeField EmField;
+    typedef A2AMatrixBlockComputation<Complex, 
+                                      FermionField, 
+                                      A2AAslashFieldMetadata, 
+                                      HADRONS_A2AM_IO_TYPE> Computation; // type of the "computation" temporary created in setup()
+    typedef AslashFieldKernel<Complex, FImpl> Kernel;
+public:
+    // constructor: takes the module instance name
+    TA2AAslashField(const std::string name);
+    // destructor: empty body
+    virtual ~TA2AAslashField(void) {};
+    // dependency relation: names of the objects read and produced by this module
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: declares the temporaries used during execution
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(A2AAslashField, ARG(TA2AAslashField<FIMPL, PhotonR>), MContraction); // ARG(...) wraps the type, presumably to protect its comma inside the macro call - confirm
+
+/******************************************************************************
+ *                 TA2AAslashField implementation                             *
+ ******************************************************************************/
+// constructor: only forwards the module name to the Module base ///////////////
+template <typename FImpl, typename PhotonImpl>
+TA2AAslashField<FImpl, PhotonImpl>::TA2AAslashField(const std::string name)
+: Module<A2AAslashFieldPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// input names: every EM field first, then the left and right vector sets
+template <typename FImpl, typename PhotonImpl>
+std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getInput(void)
+{
+    std::vector<std::string> deps(par().emField);
+
+    deps.push_back(par().left);
+    deps.push_back(par().right);
+    return deps;
+}
+
+// output names: this module declares no output object, so the returned list
+// is always empty
+template <typename FImpl, typename PhotonImpl>
+std::vector<std::string> TA2AAslashField<FImpl, PhotonImpl>::getOutput(void)
+{
+    return std::vector<std::string>();
+}
+
+// setup: registers the temporaries fetched by execute() ///////////////////////
+template <typename FImpl, typename PhotonImpl>
+void TA2AAslashField<FImpl, PhotonImpl>::setup(void)
+{
+    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
+           env().getNd() - 1, par().emField.size(), 1, par().block, 
+           par().cacheBlock, this); // block computation run at the end of execute()
+    envTmp(std::vector<ComplexField>, "B0", 1, 
+           par().emField.size(), envGetGrid(ComplexField)); // one field per EM field, filled in execute()
+    envTmp(std::vector<ComplexField>, "B1", 1, 
+           par().emField.size(), envGetGrid(ComplexField)); // same size as B0
+    envTmpLat(ComplexField, "Amu"); // NOTE(review): fetched in execute() but never used there
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl, typename PhotonImpl>
+void TA2AAslashField<FImpl, PhotonImpl>::execute(void)
+{
+    // vector sets entering on the left and on the right of the computation
+    auto &left  = envGet(std::vector<FermionField>, par().left);
+    auto &right = envGet(std::vector<FermionField>, par().right);
+
+    int nt  = env().getDim().back();
+    int N_i = left.size();
+    int N_j = right.size();
+
+    LOG(Message) << "Computing all-to-all A-slash fields" << std::endl;
+    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
+    LOG(Message) << "EM fields:" << std::endl;
+    for (auto &name: par().emField)
+    {
+        LOG(Message) << "  " << name << std::endl;
+    }
+    // sizeof() leads the product so that it is evaluated in size_t: with
+    // nt*N_i*N_j first, the int product could overflow for large N_i*N_j
+    LOG(Message) << "A-slash field size: " << nt << "*" << N_i << "*" << N_j 
+                 << " (filesize " << sizeString(sizeof(HADRONS_A2AM_IO_TYPE)*nt*N_i*N_j) 
+                 << "/EM field)" << std::endl;
+
+    // "B" complexified fields: B0 = A[0] + timesI(A[1]), B1 = A[2] + timesI(A[3])
+    startTimer("Complexify EM fields");
+    envGetTmp(std::vector<ComplexField>, B0);
+    envGetTmp(std::vector<ComplexField>, B1);
+    for (unsigned int i = 0; i < par().emField.size(); ++i)
+    {
+        auto &A = envGet(EmField, par().emField[i]);
+        envGetTmp(ComplexField, Amu); // NOTE(review): never used below
+
+        B0[i]  = peekLorentz(A, 0);
+        B0[i] += timesI(peekLorentz(A, 1));
+        B1[i]  = peekLorentz(A, 2);
+        B1[i] += timesI(peekLorentz(A, 3));
+    }
+    stopTimer("Complexify EM fields");
+
+    // I/O name & metadata lambdas, indexed by EM field (second index unused)
+    auto ionameFn = [this](const unsigned int em, const unsigned int dummy)
+    {
+        return par().emField[em];
+    };
+
+    auto filenameFn = [this, &ionameFn](const unsigned int em, const unsigned int dummy)
+    {
+        return par().output + "." + std::to_string(vm().getTrajectory()) 
+               + "/" + ionameFn(em, dummy) + ".h5";
+    };
+
+    auto metadataFn = [this](const unsigned int em, const unsigned int dummy)
+    {
+        A2AAslashFieldMetadata md;
+
+        md.emFieldName = par().emField[em];
+
+        return md;
+    };
+
+    // executing computation (the kernel only keeps references to B0 and B1)
+    Kernel kernel(B0, B1, envGetGrid(FermionField));
+
+    envGetTmp(Computation, computation);
+    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MContraction_A2AAslashField_hpp_
diff --git a/Hadrons/Modules/MContraction/A2AMesonField.cc b/Hadrons/Modules/MContraction/A2AMesonField.cc
new file mode 100644
index 00000000..c71f8ef2
--- /dev/null
+++ b/Hadrons/Modules/MContraction/A2AMesonField.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/A2AMesonField.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TA2AMesonField<FIMPL>;
diff --git a/Hadrons/Modules/MContraction/A2AMesonField.hpp b/Hadrons/Modules/MContraction/A2AMesonField.hpp
new file mode 100644
index 00000000..4d35d1ef
--- /dev/null
+++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp
@@ -0,0 +1,315 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/A2AMesonField.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MContraction_A2AMesonField_hpp_
+#define Hadrons_MContraction_A2AMesonField_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                     All-to-all meson field creation                        *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MContraction)
+
+class A2AMesonFieldPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar,
+                                    int, cacheBlock,     // forwarded to the block computation in setup()
+                                    int, block,          // forwarded to the block computation in setup()
+                                    std::string, left,   // name of the left std::vector<FermionField>
+                                    std::string, right,  // name of the right std::vector<FermionField>
+                                    std::string, output, // prefix of "<output>.<trajectory>/<name>.h5"
+                                    std::string, gammas, // "all" or a list parsed by strToVec<Gamma::Algebra>
+                                    std::vector<std::string>, mom); // one string of Nd-1 components per momentum
+};
+
+class A2AMesonFieldMetadata: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldMetadata,
+                                    std::vector<RealF>, momentum, // components of the momentum of this entry
+                                    Gamma::Algebra, gamma);       // spin bilinear of this entry
+};
+
+template <typename T, typename FImpl>
+class MesonFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
+{
+public:
+    typedef typename FImpl::FermionField FermionField;
+public:
+    // gamma and mom are stored by reference: they must outlive the kernel
+    MesonFieldKernel(const std::vector<Gamma::Algebra> &gamma,
+                     const std::vector<LatticeComplex> &mom,
+                     GridBase *grid)
+    : gamma_(gamma), mom_(mom), grid_(grid), vol_(1.)
+    {
+        for (auto &dim: grid_->GlobalDimensions()) vol_ *= dim; // global volume
+    }
+    virtual ~MesonFieldKernel(void) = default;
+    // block contraction, delegated to A2Autils<FImpl>::MesonField
+    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left,
+                            const FermionField *right,
+                            const unsigned int orthogDim, double &t)
+    {
+        A2Autils<FImpl>::MesonField(m, left, right, gamma_, mom_, orthogDim, &t);
+    }
+    // flops count reported for a blockSizei x blockSizej block
+    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        const double perSite = 2*8.0+6.0+8.0*mom_.size();
+        return vol_*perSite*blockSizei*blockSizej*gamma_.size();
+    }
+    // byte count reported for a blockSizei x blockSizej block
+    virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
+    {
+        const double fixed  = 12.0*sizeof(T);
+        const double perMom = 2.0*sizeof(T)*mom_.size();
+        return vol_*fixed*blockSizei*blockSizej
+               +  vol_*perMom*blockSizei*blockSizej*gamma_.size();
+    }
+private:
+    const std::vector<Gamma::Algebra> &gamma_;
+    const std::vector<LatticeComplex> &mom_;
+    GridBase                          *grid_;
+    double                            vol_;
+};
+
+template <typename FImpl>
+class TA2AMesonField : public Module<A2AMesonFieldPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    typedef A2AMatrixBlockComputation<Complex, 
+                                      FermionField, 
+                                      A2AMesonFieldMetadata, 
+                                      HADRONS_A2AM_IO_TYPE> Computation;
+    typedef MesonFieldKernel<Complex, FImpl> Kernel; // kernel passed to Computation::execute
+public:
+    // constructor (also builds the name of the momentum phase object)
+    TA2AMesonField(const std::string name);
+    // destructor
+    virtual ~TA2AMesonField(void){};
+    // dependency relation (inputs: left/right vectors, no output)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup (parses gammas and momenta, registers the environment objects)
+    virtual void setup(void);
+    // execution (builds the momentum phases once, then runs the computation)
+    virtual void execute(void);
+private:
+    bool                               hasPhase_{false}; // true once execute() has filled the phases
+    std::string                        momphName_;       // module name + "_momph"
+    std::vector<Gamma::Algebra>        gamma_;           // spin bilinears, set in setup()
+    std::vector<std::vector<Real>>     mom_;             // momenta, Nd-1 components each, set in setup()
+};
+
+MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
+
+/******************************************************************************
+*                  TA2AMesonField implementation                             *
+******************************************************************************/
+// constructor: names the momentum phase object "<module name>_momph" //////////
+template <typename FImpl>
+TA2AMesonField<FImpl>::TA2AMesonField(const std::string name)
+: Module<A2AMesonFieldPar>(name)
+, momphName_(name + "_momph")
+{
+}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// input names: the left vector set followed by the right vector set, both
+// taken from the module parameters
+template <typename FImpl>
+std::vector<std::string> TA2AMesonField<FImpl>::getInput(void)
+{
+    return std::vector<std::string>{par().left, par().right};
+}
+
+// output names: this module declares no output object, so the returned list
+// is always empty
+template <typename FImpl>
+std::vector<std::string> TA2AMesonField<FImpl>::getOutput(void)
+{
+    return std::vector<std::string>();
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TA2AMesonField<FImpl>::setup(void)
+{
+    gamma_.clear();
+    mom_.clear();
+
+    // spin bilinears: parsed from the "gammas" parameter unless it is "all",
+    // in which case the 16 structures listed below are used
+    if (par().gammas != "all")
+    {
+        gamma_ = strToVec<Gamma::Algebra>(par().gammas);
+    }
+    else
+    {
+        gamma_ = {
+            Gamma::Algebra::Gamma5,       Gamma::Algebra::Identity,
+            Gamma::Algebra::GammaX,       Gamma::Algebra::GammaY,
+            Gamma::Algebra::GammaZ,       Gamma::Algebra::GammaT,
+            Gamma::Algebra::GammaXGamma5, Gamma::Algebra::GammaYGamma5,
+            Gamma::Algebra::GammaZGamma5, Gamma::Algebra::GammaTGamma5,
+            Gamma::Algebra::SigmaXY,      Gamma::Algebra::SigmaXZ,
+            Gamma::Algebra::SigmaXT,      Gamma::Algebra::SigmaYZ,
+            Gamma::Algebra::SigmaYT,      Gamma::Algebra::SigmaZT
+        };
+    }
+
+    // momenta: one string per momentum, each must hold Nd-1 components
+    for (auto &momStr: par().mom)
+    {
+        auto components = strToVec<Real>(momStr);
+
+        if (components.size() != env().getNd() - 1)
+        {
+            HADRONS_ERROR(Size, "Momentum has " + std::to_string(components.size())
+                                + " components instead of "
+                                + std::to_string(env().getNd() - 1));
+        }
+        mom_.push_back(components);
+    }
+
+    // phase fields (one per momentum, registered through envCache), scratch
+    // coordinate field and block computation object, all used in execute()
+    envCache(std::vector<ComplexField>, momphName_, 1, 
+             par().mom.size(), envGetGrid(ComplexField));
+    envTmpLat(ComplexField, "coor");
+    envTmp(Computation, "computation", 1, envGetGrid(FermionField), 
+           env().getNd() - 1, mom_.size(), gamma_.size(), par().block, 
+           par().cacheBlock, this);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TA2AMesonField<FImpl>::execute(void)
+{
+    auto &left  = envGet(std::vector<FermionField>, par().left);
+    auto &right = envGet(std::vector<FermionField>, par().right);
+
+    int nt  = env().getDim().back();
+    int N_i = left.size();
+    int N_j = right.size();
+
+    LOG(Message) << "Computing all-to-all meson fields" << std::endl;
+    LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl;
+    LOG(Message) << "Momenta:" << std::endl;
+    for (auto &p: mom_)
+    {
+        LOG(Message) << "  " << p << std::endl;
+    }
+    LOG(Message) << "Spin bilinears:" << std::endl;
+    for (auto &g: gamma_)
+    {
+        LOG(Message) << "  " << g << std::endl;
+    }
+    // sizeof() leads the product so that it is evaluated in size_t: with
+    // nt*N_i*N_j first, the int product could overflow for large N_i*N_j
+    LOG(Message) << "Meson field size: " << nt << "*" << N_i << "*" << N_j 
+                 << " (filesize " << sizeString(sizeof(HADRONS_A2AM_IO_TYPE)*nt*N_i*N_j) 
+                 << "/momentum/bilinear)" << std::endl;
+
+    // momentum phases exp(2*pi*i*sum_mu p_mu*x_mu/L_mu): filled on the first
+    // call only, later calls skip the block below thanks to hasPhase_
+    auto &ph = envGet(std::vector<ComplexField>, momphName_);
+
+    if (!hasPhase_)
+    {
+        startTimer("Momentum phases");
+        for (unsigned int j = 0; j < mom_.size(); ++j)
+        {
+            Complex i(0.0,1.0);
+
+            envGetTmp(ComplexField, coor);
+            ph[j] = Zero();
+            for(unsigned int mu = 0; mu < mom_[j].size(); mu++)
+            {
+                LatticeCoordinate(coor, mu);
+                ph[j] = ph[j] + (mom_[j][mu]/env().getDim(mu))*coor;
+            }
+            ph[j] = exp((Real)(2*M_PI)*i*ph[j]);
+        }
+        hasPhase_ = true;
+        stopTimer("Momentum phases");
+    }
+
+    // I/O name "<gamma>_<p_0>_..._<p_n>" for momentum m and bilinear g
+    auto ionameFn = [this](const unsigned int m, const unsigned int g)
+    {
+        std::stringstream ss;
+
+        ss << gamma_[g] << "_";
+        for (unsigned int mu = 0; mu < mom_[m].size(); ++mu)
+        {
+            ss << mom_[m][mu] << ((mu == mom_[m].size() - 1) ? "" : "_");
+        }
+
+        return ss.str();
+    };
+
+    auto filenameFn = [this, &ionameFn](const unsigned int m, const unsigned int g)
+    {
+        return par().output + "." + std::to_string(vm().getTrajectory()) 
+               + "/" + ionameFn(m, g) + ".h5";
+    };
+
+    auto metadataFn = [this](const unsigned int m, const unsigned int g)
+    {
+        A2AMesonFieldMetadata md;
+
+        for (auto pmu: mom_[m])
+        {
+            md.momentum.push_back(pmu);
+        }
+        md.gamma = gamma_[g];
+
+        return md;
+    };
+
+    Kernel      kernel(gamma_, ph, envGetGrid(FermionField));
+
+    envGetTmp(Computation, computation);
+    computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MContraction_A2AMesonField_hpp_
diff --git a/Hadrons/Modules/MContraction/Baryon.cc b/Hadrons/Modules/MContraction/Baryon.cc
new file mode 100644
index 00000000..f597a858
--- /dev/null
+++ b/Hadrons/Modules/MContraction/Baryon.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/Baryon.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MContraction/Baryon.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TBaryon<FIMPL,FIMPL,FIMPL>;
+
diff --git a/extras/Hadrons/Modules/MContraction/Baryon.hpp b/Hadrons/Modules/MContraction/Baryon.hpp
similarity index 91%
rename from extras/Hadrons/Modules/MContraction/Baryon.hpp
rename to Hadrons/Modules/MContraction/Baryon.hpp
index 1ef2e257..01db09e0 100644
--- a/extras/Hadrons/Modules/MContraction/Baryon.hpp
+++ b/Hadrons/Modules/MContraction/Baryon.hpp
@@ -2,12 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/Baryon.hpp
+Source file: Hadrons/Modules/MContraction/Baryon.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -30,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MContraction_Baryon_hpp_
 #define Hadrons_MContraction_Baryon_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -68,7 +68,7 @@ public:
     // constructor
     TBaryon(const std::string name);
     // destructor
-    virtual ~TBaryon(void) = default;
+    virtual ~TBaryon(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -79,7 +79,7 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(Baryon, ARG(TBaryon<FIMPL, FIMPL, FIMPL>), MContraction);
+MODULE_REGISTER_TMP(Baryon, ARG(TBaryon<FIMPL, FIMPL, FIMPL>), MContraction);
 
 /******************************************************************************
  *                         TBaryon implementation                             *
@@ -122,7 +122,6 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
                  << " quarks '" << par().q1 << "', '" << par().q2 << "', and '"
                  << par().q3 << "'" << std::endl;
     
-    CorrWriter writer(par().output);
     auto       &q1 = envGet(PropagatorField1, par().q1);
     auto       &q2 = envGet(PropagatorField2, par().q2);
     auto       &q3 = envGet(PropagatorField3, par().q2);
@@ -131,7 +130,7 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
     
     // FIXME: do contractions
     
-    // write(writer, "meson", result);
+    // saveResult(par().output, "meson", result);
 }
 
 END_MODULE_NAMESPACE
diff --git a/Hadrons/Modules/MContraction/DiscLoop.cc b/Hadrons/Modules/MContraction/DiscLoop.cc
new file mode 100644
index 00000000..fa4cd235
--- /dev/null
+++ b/Hadrons/Modules/MContraction/DiscLoop.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/DiscLoop.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MContraction/DiscLoop.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TDiscLoop<FIMPL>;
+
diff --git a/extras/Hadrons/Modules/MContraction/DiscLoop.hpp b/Hadrons/Modules/MContraction/DiscLoop.hpp
similarity index 90%
rename from extras/Hadrons/Modules/MContraction/DiscLoop.hpp
rename to Hadrons/Modules/MContraction/DiscLoop.hpp
index ef50061c..f8b88eb2 100644
--- a/extras/Hadrons/Modules/MContraction/DiscLoop.hpp
+++ b/Hadrons/Modules/MContraction/DiscLoop.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/DiscLoop.hpp
+Source file: Hadrons/Modules/MContraction/DiscLoop.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MContraction_DiscLoop_hpp_
 #define Hadrons_MContraction_DiscLoop_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -64,7 +65,7 @@ public:
     // constructor
     TDiscLoop(const std::string name);
     // destructor
-    virtual ~TDiscLoop(void) = default;
+    virtual ~TDiscLoop(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -75,7 +76,7 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(DiscLoop, TDiscLoop<FIMPL>, MContraction);
+MODULE_REGISTER_TMP(DiscLoop, TDiscLoop<FIMPL>, MContraction);
 
 /******************************************************************************
  *                       TDiscLoop implementation                             *
@@ -118,7 +119,6 @@ void TDiscLoop<FImpl>::execute(void)
                  << "' using '" << par().q_loop << "' with " << par().gamma 
                  << " insertion." << std::endl;
 
-    CorrWriter            writer(par().output);
     auto                  &q_loop = envGet(PropagatorField, par().q_loop);
     Gamma                 gamma(par().gamma);
     std::vector<TComplex> buf;
@@ -127,15 +127,13 @@ void TDiscLoop<FImpl>::execute(void)
     envGetTmp(LatticeComplex, c);
     c = trace(gamma*q_loop);
     sliceSum(c, buf, Tp);
-
     result.gamma = par().gamma;
     result.corr.resize(buf.size());
     for (unsigned int t = 0; t < buf.size(); ++t)
     {
         result.corr[t] = TensorRemove(buf[t]);
     }
-
-    write(writer, "disc", result);
+    saveResult(par().output, "disc", result);
 }
 
 END_MODULE_NAMESPACE
diff --git a/extras/Hadrons/Exceptions.cc b/Hadrons/Modules/MContraction/Gamma3pt.cc
similarity index 54%
rename from extras/Hadrons/Exceptions.cc
rename to Hadrons/Modules/MContraction/Gamma3pt.cc
index b251939f..ca1ca441 100644
--- a/extras/Hadrons/Exceptions.cc
+++ b/Hadrons/Modules/MContraction/Gamma3pt.cc
@@ -2,9 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Exceptions.cc
+Source file: Hadrons/Modules/MContraction/Gamma3pt.cc
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -25,33 +25,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-
-#include <Grid/Hadrons/Exceptions.hpp>
-
-#ifndef ERR_SUFF
-#define ERR_SUFF " (" + loc + ")"
-#endif
-
-#define CONST_EXC(name, init) \
-name::name(std::string msg, std::string loc)\
-:init\
-{}
+#include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
-using namespace Exceptions;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TGamma3pt<FIMPL,FIMPL,FIMPL>;
 
-// logic errors
-CONST_EXC(Logic, logic_error(msg + ERR_SUFF))
-CONST_EXC(Definition, Logic("definition error: " + msg, loc))
-CONST_EXC(Implementation, Logic("implementation error: " + msg, loc))
-CONST_EXC(Range, Logic("range error: " + msg, loc))
-CONST_EXC(Size, Logic("size error: " + msg, loc))
-// runtime errors
-CONST_EXC(Runtime, runtime_error(msg + ERR_SUFF))
-CONST_EXC(Argument, Runtime("argument error: " + msg, loc))
-CONST_EXC(Io, Runtime("IO error: " + msg, loc))
-CONST_EXC(Memory, Runtime("memory error: " + msg, loc))
-CONST_EXC(Parsing, Runtime("parsing error: " + msg, loc))
-CONST_EXC(Program, Runtime("program error: " + msg, loc))
-CONST_EXC(System, Runtime("system error: " + msg, loc))
diff --git a/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp b/Hadrons/Modules/MContraction/Gamma3pt.hpp
similarity index 92%
rename from extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
rename to Hadrons/Modules/MContraction/Gamma3pt.hpp
index fb9a9d4b..2a0da43a 100644
--- a/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
+++ b/Hadrons/Modules/MContraction/Gamma3pt.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
+Source file: Hadrons/Modules/MContraction/Gamma3pt.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MContraction_Gamma3pt_hpp_
 #define Hadrons_MContraction_Gamma3pt_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -95,7 +96,7 @@ public:
     // constructor
     TGamma3pt(const std::string name);
     // destructor
-    virtual ~TGamma3pt(void) = default;
+    virtual ~TGamma3pt(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -106,7 +107,7 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(Gamma3pt, ARG(TGamma3pt<FIMPL, FIMPL, FIMPL>), MContraction);
+MODULE_REGISTER_TMP(Gamma3pt, ARG(TGamma3pt<FIMPL, FIMPL, FIMPL>), MContraction);
 
 /******************************************************************************
  *                       TGamma3pt implementation                             *
@@ -152,7 +153,6 @@ void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
 
     // Initialise variables. q2 and q3 are normal propagators, q1 may be 
     // sink smeared.
-    CorrWriter            writer(par().output);
     auto                  &q1 = envGet(SlicedPropagator1, par().q1);
     auto                  &q2 = envGet(PropagatorField2, par().q2);
     auto                  &q3 = envGet(PropagatorField2, par().q3);
@@ -174,8 +174,7 @@ void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
     {
         result.corr[t] = TensorRemove(buf[t]);
     }
-
-    write(writer, "gamma3pt", result);
+    saveResult(par().output, "gamma3pt", result);
 }
 
 END_MODULE_NAMESPACE
diff --git a/Hadrons/Modules/MContraction/Meson.cc b/Hadrons/Modules/MContraction/Meson.cc
new file mode 100644
index 00000000..7fac7e95
--- /dev/null
+++ b/Hadrons/Modules/MContraction/Meson.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/Meson.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MContraction/Meson.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TMeson<FIMPL,FIMPL>;
+
diff --git a/extras/Hadrons/Modules/MContraction/Meson.hpp b/Hadrons/Modules/MContraction/Meson.hpp
similarity index 92%
rename from extras/Hadrons/Modules/MContraction/Meson.hpp
rename to Hadrons/Modules/MContraction/Meson.hpp
index 46bbdb2e..514a6bb9 100644
--- a/extras/Hadrons/Modules/MContraction/Meson.hpp
+++ b/Hadrons/Modules/MContraction/Meson.hpp
@@ -2,14 +2,13 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/Meson.hpp
+Source file: Hadrons/Modules/MContraction/Meson.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
-        Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
+Author: Vera Guelpers <vmg1n14@soton.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -32,9 +31,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MContraction_Meson_hpp_
 #define Hadrons_MContraction_Meson_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -47,8 +46,8 @@ BEGIN_HADRONS_NAMESPACE
  - q1: input propagator 1 (string)
  - q2: input propagator 2 (string)
  - gammas: gamma products to insert at sink & source, pairs of gamma matrices 
-           (space-separated strings) in angled brackets (i.e. <g_sink g_src>),
-           in a sequence (e.g. "<Gamma5 Gamma5><Gamma5 GammaT>").
+           (space-separated strings) in round brackets (i.e. (g_sink g_src)),
+           in a sequence (e.g. "(Gamma5 Gamma5)(Gamma5 GammaT)").
 
            Special values: "all" - perform all possible contractions.
  - sink: module to compute the sink to use in contraction (string).
@@ -78,7 +77,7 @@ class TMeson: public Module<MesonPar>
 public:
     FERM_TYPE_ALIASES(FImpl1, 1);
     FERM_TYPE_ALIASES(FImpl2, 2);
-    FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
+    BASIC_TYPE_ALIASES(ScalarImplCR, Scalar);
     SINK_TYPE_ALIASES(Scalar);
     class Result: Serializable
     {
@@ -92,7 +91,7 @@ public:
     // constructor
     TMeson(const std::string name);
     // destructor
-    virtual ~TMeson(void) = default;
+    virtual ~TMeson(void) {};
     // dependencies/products
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -104,7 +103,7 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(Meson, ARG(TMeson<FIMPL, FIMPL>), MContraction);
+MODULE_REGISTER_TMP(Meson, ARG(TMeson<FIMPL, FIMPL>), MContraction);
 
 /******************************************************************************
  *                           TMeson implementation                            *
@@ -174,7 +173,6 @@ void TMeson<FImpl1, FImpl2>::execute(void)
                  << " quarks '" << par().q1 << "' and '" << par().q2 << "'"
                  << std::endl;
     
-    CorrWriter             writer(par().output);
     std::vector<TComplex>  buf;
     std::vector<Result>    result;
     Gamma                  g5(Gamma::Algebra::Gamma5);
@@ -241,7 +239,7 @@ void TMeson<FImpl1, FImpl2>::execute(void)
             }
         }
     }
-    write(writer, "meson", result);
+    saveResult(par().output, "meson", result);
 }
 
 END_MODULE_NAMESPACE
diff --git a/Hadrons/Modules/MContraction/WardIdentity.cc b/Hadrons/Modules/MContraction/WardIdentity.cc
new file mode 100644
index 00000000..98c06d12
--- /dev/null
+++ b/Hadrons/Modules/MContraction/WardIdentity.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MContraction/WardIdentity.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MContraction/WardIdentity.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+// explicit instantiation of the module class for FIMPL in this file
+template class Grid::Hadrons::MContraction::TWardIdentity<FIMPL>;
+
diff --git a/extras/Hadrons/Modules/MContraction/WardIdentity.hpp b/Hadrons/Modules/MContraction/WardIdentity.hpp
similarity index 93%
rename from extras/Hadrons/Modules/MContraction/WardIdentity.hpp
rename to Hadrons/Modules/MContraction/WardIdentity.hpp
index 9f04a0e9..8f27900e 100644
--- a/extras/Hadrons/Modules/MContraction/WardIdentity.hpp
+++ b/Hadrons/Modules/MContraction/WardIdentity.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/WardIdentity.hpp
+Source file: Hadrons/Modules/MContraction/WardIdentity.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MContraction_WardIdentity_hpp_
 #define Hadrons_MContraction_WardIdentity_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -70,7 +71,7 @@ public:
     // constructor
     TWardIdentity(const std::string name);
     // destructor
-    virtual ~TWardIdentity(void) = default;
+    virtual ~TWardIdentity(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -83,7 +84,7 @@ private:
     unsigned int Ls_;
 };
 
-MODULE_REGISTER_NS(WardIdentity, TWardIdentity<FIMPL>, MContraction);
+MODULE_REGISTER_TMP(WardIdentity, TWardIdentity<FIMPL>, MContraction);
 
 /******************************************************************************
  *                     TWardIdentity implementation                           *
@@ -118,7 +119,7 @@ void TWardIdentity<FImpl>::setup(void)
     Ls_ = env().getObjectLs(par().q);
     if (Ls_ != env().getObjectLs(par().action))
     {
-        HADRON_ERROR(Size, "Ls mismatch between quark action and propagator");
+        HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
     }
     envTmpLat(PropagatorField, "tmp");
     envTmpLat(PropagatorField, "vector_WI");
@@ -174,7 +175,7 @@ void TWardIdentity<FImpl>::execute(void)
             axial_defect += trace(g5*tmp);
         }
 
-        // Get <P|J5q> for 5D (Zero() for 4D) and <P|P>.
+        // Get <P|J5q> for 5D (zero for 4D) and <P|P>.
         PJ5q = Zero();
         if (Ls_ > 1)
         {
diff --git a/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp b/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
similarity index 90%
rename from extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
rename to Hadrons/Modules/MContraction/WeakHamiltonian.hpp
index 15f1a009..824dc780 100644
--- a/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
+++ b/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
+Source file: Hadrons/Modules/MContraction/WeakHamiltonian.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
 #define Hadrons_MContraction_WeakHamiltonian_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -96,7 +97,7 @@ public:\
     /* constructor */ \
     T##modname(const std::string name);\
     /* destructor */ \
-    virtual ~T##modname(void) = default;\
+    virtual ~T##modname(void) {};\
     /* dependency relation */ \
     virtual std::vector<std::string> getInput(void);\
     virtual std::vector<std::string> getOutput(void);\
@@ -108,7 +109,7 @@ protected:\
     /* execution */ \
     virtual void execute(void);\
 };\
-MODULE_REGISTER_NS(modname, T##modname, MContraction);
+MODULE_REGISTER(modname, T##modname, MContraction);
 
 END_MODULE_NAMESPACE
 
diff --git a/extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc b/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
similarity index 95%
rename from extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
rename to Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
index 43dfa609..2183d992 100644
--- a/extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
+++ b/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
+Source file: Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -26,7 +27,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
+#include <Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -103,7 +104,6 @@ void TWeakHamiltonianEye::execute(void)
                  << par().q2 << ", '" << par().q3 << "' and '" << par().q4 
                  << "'." << std::endl;
 
-    CorrWriter             writer(par().output);
     auto                   &q1 = envGet(SlicedPropagator, par().q1);
     auto                   &q2 = envGet(PropagatorField, par().q2);
     auto                   &q3 = envGet(PropagatorField, par().q3);
@@ -146,5 +146,6 @@ void TWeakHamiltonianEye::execute(void)
     SUM_MU(expbuf, E_body[mu]*E_loop[mu])
     MAKE_DIAG(expbuf, corrbuf, result[E_diag], "HW_E")
 
-    write(writer, "HW_Eye", result);
+    // IO
+    saveResult(par().output, "HW_Eye", result);
 }
diff --git a/extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp b/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
similarity index 88%
rename from extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
rename to Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
index 3a2b9309..37d37022 100644
--- a/extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
+++ b/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
+Source file: Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,7 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_
 #define Hadrons_MContraction_WeakHamiltonianEye_hpp_
 
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
+#include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
diff --git a/extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc b/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
similarity index 95%
rename from extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
rename to Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
index 8a7113e3..5c1d8a1c 100644
--- a/extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
+++ b/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
+Source file: Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -26,7 +27,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
+#include <Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -103,7 +104,6 @@ void TWeakHamiltonianNonEye::execute(void)
                  << par().q2 << ", '" << par().q3 << "' and '" << par().q4 
                  << "'." << std::endl;
     
-    CorrWriter            writer(par().output);
     auto                  &q1 = envGet(PropagatorField, par().q1);
     auto                  &q2 = envGet(PropagatorField, par().q2);
     auto                  &q3 = envGet(PropagatorField, par().q3);
@@ -143,5 +143,6 @@ void TWeakHamiltonianNonEye::execute(void)
     SUM_MU(expbuf, W_i_side_loop[mu]*W_f_side_loop[mu])
     MAKE_DIAG(expbuf, corrbuf, result[W_diag], "HW_W")
 
-    write(writer, "HW_NonEye", result);
+    // IO
+    saveResult(par().output, "HW_NonEye", result);
 }
diff --git a/extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp b/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
similarity index 87%
rename from extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
rename to Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
index eb5abe3c..19c0ebaa 100644
--- a/extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
+++ b/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
+Source file: Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,7 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
 #define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
 
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
+#include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
diff --git a/extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc b/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
similarity index 94%
rename from extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
rename to Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
index 18423f3e..2273dd3d 100644
--- a/extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
+++ b/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
+Source file: Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -26,7 +27,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
+#include <Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -103,7 +104,6 @@ void TWeakNeutral4ptDisc::execute(void)
                  << par().q2 << ", '" << par().q3 << "' and '" << par().q4 
                  << "'." << std::endl;
 
-    CorrWriter            writer(par().output);
     auto                  &q1 = envGet(PropagatorField, par().q1);
     auto                  &q2 = envGet(PropagatorField, par().q2);
     auto                  &q3 = envGet(PropagatorField, par().q3);
@@ -137,5 +137,6 @@ void TWeakNeutral4ptDisc::execute(void)
     expbuf *= curr;
     MAKE_DIAG(expbuf, corrbuf, result[neut_disc_2_diag], "HW_disc0_2")
 
-    write(writer, "HW_disc0", result);
+    // IO
+    saveResult(par().output, "HW_disc0", result);
 }
diff --git a/extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp b/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
similarity index 88%
rename from extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
rename to Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
index f26d4636..3e20633f 100644
--- a/extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
+++ b/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
+Source file: Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,7 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
 #define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
 
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
+#include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
diff --git a/Hadrons/Modules/MFermion/FreeProp.cc b/Hadrons/Modules/MFermion/FreeProp.cc
new file mode 100644
index 00000000..1256f19c
--- /dev/null
+++ b/Hadrons/Modules/MFermion/FreeProp.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MFermion/FreeProp.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Vera Guelpers <vmg1n14@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MFermion/FreeProp.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MFermion;
+// explicit instantiation of the module class for FIMPL in this file
+template class Grid::Hadrons::MFermion::TFreeProp<FIMPL>;
+
diff --git a/Hadrons/Modules/MFermion/FreeProp.hpp b/Hadrons/Modules/MFermion/FreeProp.hpp
new file mode 100644
index 00000000..eb7971f5
--- /dev/null
+++ b/Hadrons/Modules/MFermion/FreeProp.hpp
@@ -0,0 +1,188 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MFermion/FreeProp.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Vera Guelpers <vmg1n14@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+
+#ifndef Hadrons_MFermion_FreeProp_hpp_
+#define Hadrons_MFermion_FreeProp_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                                  FreeProp                                  *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MFermion)
+
+class FreePropPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
+                                    std::string, source, // name of the source object
+                                    std::string, action, // name of the fermion action object
+                                    double,      mass,   // mass handed to FreePropagator()
+                                    std::string, twist); // twist angles, parsed into Nd doubles
+};
+
+template <typename FImpl>
+class TFreeProp: public Module<FreePropPar> // module parametrised by FreePropPar
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,); // NOTE(review): presumably declares FermionField, PropagatorField, FMat — confirm
+public:
+    // constructor (takes the module name)
+    TFreeProp(const std::string name);
+    // destructor
+    virtual ~TFreeProp(void) {};
+    // dependency relation: names of the objects consumed / produced
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup: reads Ls_ and registers outputs and temporaries
+    virtual void setup(void);
+    // execution: computes the propagator spin-colour by spin-colour
+    virtual void execute(void);
+private:
+    unsigned int Ls_; // Ls of the action, assigned in setup()
+};
+
+MODULE_REGISTER_TMP(FreeProp, TFreeProp<FIMPL>, MFermion);
+
+/******************************************************************************
+ *                          TFreeProp implementation                          *
+ ******************************************************************************/
+// constructor: forwards the module name to the Module<FreePropPar> base ///////
+template <typename FImpl>
+TFreeProp<FImpl>::TFreeProp(const std::string name)
+: Module<FreePropPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TFreeProp<FImpl>::getInput(void)
+{
+    // this module consumes the source and the fermion action, both by name
+    std::vector<std::string> deps{par().source, par().action};
+    return deps;
+}
+
+template <typename FImpl>
+std::vector<std::string> TFreeProp<FImpl>::getOutput(void)
+{
+    // products: the propagator named after the module and its "_5d" variant
+    const std::string        base = getName();
+    return std::vector<std::string>{base, base + "_5d"};
+}
+
+// setup: read Ls from the action and register outputs and temporaries /////////
+template <typename FImpl>
+void TFreeProp<FImpl>::setup(void)
+{
+    Ls_ = env().getObjectLs(par().action); // Ls of the action object
+    envCreateLat(PropagatorField, getName()); // output named after the module
+    envTmpLat(FermionField, "source", Ls_); // temporaries registered with Ls_
+    envTmpLat(FermionField, "sol", Ls_);
+    envTmpLat(FermionField, "tmp"); // temporary registered without Ls argument
+    if (Ls_ > 1) // extra "_5d" output only when the action has Ls_ > 1
+    {
+        envCreateLat(PropagatorField, getName() + "_5d", Ls_);
+    }    
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Computes the free propagator of action par().action on source par().source,
+// one spin-colour component at a time. For Ls_ == 1 the result goes to
+// getName(); for Ls_ > 1 the 5D result goes to getName() + "_5d" and its 4D
+// export to getName().
+template <typename FImpl>
+void TFreeProp<FImpl>::execute(void)
+{
+    LOG(Message) << "Computing free fermion propagator '" << getName() << "'"
+                 << std::endl;
+    
+    std::string propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
+    auto        &prop    = envGet(PropagatorField, propName);
+    auto        &fullSrc = envGet(PropagatorField, par().source);
+    auto        &mat     = envGet(FMat, par().action);
+    RealD       mass     = par().mass;
+    const bool  src5d    = env().isObject5d(par().source);
+    // loop-invariant: parse the twist angles once, not once per spin-colour
+    std::vector<double> twist = strToVec<double>(par().twist);
+    
+    envGetTmp(FermionField, source);
+    envGetTmp(FermionField, sol);
+    envGetTmp(FermionField, tmp);
+    LOG(Message) << "Calculating a free Propagator with mass " << mass 
+                 << " using the action '" << par().action
+                 << "' on source '" << par().source << "'" << std::endl;
+    // validate the parameters once, before any propagator is computed
+    if (src5d && (Ls_ != env().getObjectLs(par().source)))
+    {
+        HADRONS_ERROR(Size, "Ls mismatch between quark action and source");
+    }
+    if (twist.size() != Nd)
+    {
+        HADRONS_ERROR(Size, "number of twist angles does not match number of dimensions");
+    }
+    for (unsigned int s = 0; s < Ns; ++s)
+    for (unsigned int c = 0; c < FImpl::Dimension; ++c)
+    {
+        LOG(Message) << "Calculation for spin= " << s << ", color= " << c
+                     << std::endl;
+        // 4D source with a 5D action: import it into the 5D source field
+        if (!src5d && (Ls_ != 1))
+        {
+            PropToFerm<FImpl>(tmp, fullSrc, s, c);
+            mat.ImportPhysicalFermionSource(tmp, source);
+        }
+        // otherwise the source component is used as is
+        else
+        {
+            PropToFerm<FImpl>(source, fullSrc, s, c);
+        }
+        sol = Zero();
+        mat.FreePropagator(source, sol, mass, twist);
+        FermToProp<FImpl>(prop, sol, s, c);
+        // create 4D propagators from 5D one if necessary
+        if (Ls_ > 1)
+        {
+            PropagatorField &p4d = envGet(PropagatorField, getName());
+            mat.ExportPhysicalFermionSolution(sol, tmp);
+            FermToProp<FImpl>(p4d, tmp, s, c);
+        }
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MFermion_FreeProp_hpp_
diff --git a/Hadrons/Modules/MFermion/GaugeProp.cc b/Hadrons/Modules/MFermion/GaugeProp.cc
new file mode 100644
index 00000000..f3881e0f
--- /dev/null
+++ b/Hadrons/Modules/MFermion/GaugeProp.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MFermion/GaugeProp.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MFermion/GaugeProp.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MFermion;
+// explicit instantiations of the module class for FIMPL and ZFIMPL in this file
+template class Grid::Hadrons::MFermion::TGaugeProp<FIMPL>;
+template class Grid::Hadrons::MFermion::TGaugeProp<ZFIMPL>;
diff --git a/extras/Hadrons/Modules/MFermion/GaugeProp.hpp b/Hadrons/Modules/MFermion/GaugeProp.hpp
similarity index 73%
rename from extras/Hadrons/Modules/MFermion/GaugeProp.hpp
rename to Hadrons/Modules/MFermion/GaugeProp.hpp
index 3316a82f..90554607 100644
--- a/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
+++ b/Hadrons/Modules/MFermion/GaugeProp.hpp
@@ -2,14 +2,14 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MFermion/GaugeProp.hpp
+Source file: Hadrons/Modules/MFermion/GaugeProp.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
-        Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: Lanny91 <andrew.lawson@gmail.com>
+Author: pretidav <david.preti@csic.es>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -32,33 +32,13 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MFermion_GaugeProp_hpp_
 #define Hadrons_MFermion_GaugeProp_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Solver.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
-/******************************************************************************
- * 5D -> 4D and 4D -> 5D conversions.                                         *
- ******************************************************************************/
-template<class vobj> // Note that 5D object is modified.
-inline void make_4D(Lattice<vobj> &in_5d, Lattice<vobj> &out_4d, int Ls)
-{
-    axpby_ssp_pminus(in_5d, 0., in_5d, 1., in_5d, 0, 0);
-    axpby_ssp_pplus(in_5d, 1., in_5d, 1., in_5d, 0, Ls-1);
-    ExtractSlice(out_4d, in_5d, 0, 0);
-}
-
-template<class vobj>
-inline void make_5D(Lattice<vobj> &in_4d, Lattice<vobj> &out_5d, int Ls)
-{
-    out_5d = Zero();
-    InsertSlice(in_4d, out_5d, 0, 0);
-    InsertSlice(in_4d, out_5d, Ls-1, 0);
-    axpby_ssp_pplus(out_5d, 0., out_5d, 1., out_5d, 0, 0);
-    axpby_ssp_pminus(out_5d, 0., out_5d, 1., out_5d, Ls-1, Ls-1);
-}
-
 /******************************************************************************
  *                                GaugeProp                                   *
  ******************************************************************************/
@@ -76,12 +56,13 @@ template <typename FImpl>
 class TGaugeProp: public Module<GaugePropPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
+    SOLVER_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TGaugeProp(const std::string name);
     // destructor
-    virtual ~TGaugeProp(void) = default;
+    virtual ~TGaugeProp(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -92,10 +73,11 @@ protected:
     virtual void execute(void);
 private:
     unsigned int Ls_;
-    SolverFn     *solver_{nullptr};
+    Solver       *solver_{nullptr};
 };
 
-MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);
+MODULE_REGISTER_TMP(GaugeProp, TGaugeProp<FIMPL>, MFermion);
+MODULE_REGISTER_TMP(ZGaugeProp, TGaugeProp<ZFIMPL>, MFermion);
 
 /******************************************************************************
  *                      TGaugeProp implementation                             *
@@ -148,7 +130,8 @@ void TGaugeProp<FImpl>::execute(void)
     std::string propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
     auto        &prop    = envGet(PropagatorField, propName);
     auto        &fullSrc = envGet(PropagatorField, par().source);
-    auto        &solver  = envGet(SolverFn, par().solver);
+    auto        &solver  = envGet(Solver, par().solver);
+    auto        &mat     = solver.getFMat();
     
     envGetTmp(FermionField, source);
     envGetTmp(FermionField, sol);
@@ -156,21 +139,22 @@ void TGaugeProp<FImpl>::execute(void)
     LOG(Message) << "Inverting using solver '" << par().solver
                  << "' on source '" << par().source << "'" << std::endl;
     for (unsigned int s = 0; s < Ns; ++s)
-    for (unsigned int c = 0; c < Nc; ++c)
+    for (unsigned int c = 0; c < FImpl::Dimension; ++c)
     {
         LOG(Message) << "Inversion for spin= " << s << ", color= " << c
                      << std::endl;
         // source conversion for 4D sources
+        LOG(Message) << "Import source" << std::endl;
         if (!env().isObject5d(par().source))
         {
             if (Ls_ == 1)
             {
-                PropToFerm(source, fullSrc, s, c);
+               PropToFerm<FImpl>(source, fullSrc, s, c);
             }
             else
             {
-                PropToFerm(tmp, fullSrc, s, c);
-                make_5D(tmp, source, Ls_);
+                PropToFerm<FImpl>(tmp, fullSrc, s, c);
+                mat.ImportPhysicalFermionSource(tmp, source);
             }
         }
         // source conversion for 5D sources
@@ -178,22 +162,24 @@ void TGaugeProp<FImpl>::execute(void)
         {
             if (Ls_ != env().getObjectLs(par().source))
             {
-                HADRON_ERROR(Size, "Ls mismatch between quark action and source");
+                HADRONS_ERROR(Size, "Ls mismatch between quark action and source");
             }
             else
             {
-                PropToFerm(source, fullSrc, s, c);
+                PropToFerm<FImpl>(source, fullSrc, s, c);
             }
         }
         sol = Zero();
+        LOG(Message) << "Solve" << std::endl;
         solver(sol, source);
-        FermToProp(prop, sol, s, c);
+        LOG(Message) << "Export solution" << std::endl;
+        FermToProp<FImpl>(prop, sol, s, c);
         // create 4D propagators from 5D one if necessary
         if (Ls_ > 1)
         {
             PropagatorField &p4d = envGet(PropagatorField, getName());
-            make_4D(sol, tmp, Ls_);
-            FermToProp(p4d, tmp, s, c);
+            mat.ExportPhysicalFermionSolution(sol, tmp);
+            FermToProp<FImpl>(p4d, tmp, s, c);
         }
     }
 }
diff --git a/Hadrons/Modules/MGauge/Electrify.cc b/Hadrons/Modules/MGauge/Electrify.cc
new file mode 100644
index 00000000..1feea9ec
--- /dev/null
+++ b/Hadrons/Modules/MGauge/Electrify.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/Electrify.cc
+
+Copyright (C) 2015-2018
+
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MGauge/Electrify.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MGauge;
+
+template class Grid::Hadrons::MGauge::TElectrify<GIMPL>;
diff --git a/Hadrons/Modules/MGauge/Electrify.hpp b/Hadrons/Modules/MGauge/Electrify.hpp
new file mode 100644
index 00000000..58d65eba
--- /dev/null
+++ b/Hadrons/Modules/MGauge/Electrify.hpp
@@ -0,0 +1,151 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/Electrify.hpp
+
+Copyright (C) 2015-2018
+
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MGauge_Electrify_hpp_
+#define Hadrons_MGauge_Electrify_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                              Electrify gauge                               *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MGauge)
+
+/****************************************************************************
+*  Electrify a gauge field:
+*
+*  Ue_mu(x) = U_mu(x)*exp(ieqA_mu(x))
+*
+*  with
+*
+*  - gauge: U_mu(x): gauge field
+*  - emField: A_mu(x): electromagnetic photon field
+*  - e: value for the elementary charge
+*  - q: charge in units of e
+*
+*****************************************************************************/
+
+
+class ElectrifyPar: Serializable // serialisable parameter set of the Electrify module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ElectrifyPar,
+                                    std::string, gauge, // name of the input gauge field U_mu(x)
+				    std::string, emField, // name of the input photon field A_mu(x)
+				    double, e, // value of the elementary charge
+				    double, charge); // charge q in units of e; execute() uses the product e*charge
+};
+
+template <typename GImpl> // GImpl: gauge implementation handed to GAUGE_TYPE_ALIASES
+class TElectrify: public Module<ElectrifyPar>
+{
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
+public:
+    typedef PhotonR::GaugeField     EmField; // field type used to fetch par().emField
+public:
+    // constructor: forwards the instance name to the Module base
+    TElectrify(const std::string name);
+    // destructor
+    virtual ~TElectrify(void) {};
+    // dependencies (par().gauge, par().emField) / products (getName())
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup: creates the output gauge field and the "eiAmu" temporary
+    virtual void setup(void);
+    // execution: Ue_mu = U_mu * exp(i*e*charge*A_mu) for each direction mu
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(Electrify, TElectrify<GIMPL>, MGauge);
+
+/******************************************************************************
+*                            TElectrify implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename GImpl>
+TElectrify<GImpl>::TElectrify(const std::string name)
+: Module<ElectrifyPar>(name) // only forwards the instance name to the Module base
+{} // no further state to initialise
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TElectrify<GImpl>::getInput(void)
+{
+    // objects read by execute(): the gauge field first, then the photon field
+    std::vector<std::string> deps{par().gauge, par().emField};
+    return deps;
+}
+
+template <typename GImpl>
+std::vector<std::string> TElectrify<GImpl>::getOutput(void)
+{
+    // single product, registered under the module's own name
+    std::vector<std::string> products{getName()};
+    return products;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TElectrify<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName()); // output field, named after the module
+    envTmpLat(LatticeComplex, "eiAmu"); // scratch field fetched by execute() through envGetTmp
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TElectrify<GImpl>::execute(void)
+{
+    LOG(Message) << "Electrify the gauge field " << par().gauge
+                 << " using the photon field " << par().emField
+                 << " with charge e*q= " << par().e << "*" << par().charge << std::endl;
+    auto &electrified = envGet(GaugeField, getName());
+    auto &bare        = envGet(GaugeField, par().gauge);
+    auto &photon      = envGet(EmField,  par().emField);
+    envGetTmp(LatticeComplex, eiAmu);
+    // direction-independent prefactor i*e*q of the phase exponent
+    const auto ieq = Complex(0.0,1.0) * (Real)(par().e * par().charge);
+
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        eiAmu = exp(ieq * PeekIndex<LorentzIndex>(photon, mu));
+        PokeIndex<LorentzIndex>(electrified, PeekIndex<LorentzIndex>(bare, mu) * eiAmu, mu);
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MGauge_Electrify_hpp_
diff --git a/extras/Hadrons/Modules/MGauge/Random.cc b/Hadrons/Modules/MGauge/FundtoHirep.cc
similarity index 60%
rename from extras/Hadrons/Modules/MGauge/Random.cc
rename to Hadrons/Modules/MGauge/FundtoHirep.cc
index 97afd338..11b4aed5 100644
--- a/extras/Hadrons/Modules/MGauge/Random.cc
+++ b/Hadrons/Modules/MGauge/FundtoHirep.cc
@@ -2,12 +2,13 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MGauge/Random.cc
+Source file: Hadrons/Modules/MGauge/FundtoHirep.cc
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: pretidav <david.preti@csic.es>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -27,46 +28,52 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
+#include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;
 
-/******************************************************************************
-*                           TRandom implementation                            *
-******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
-TRandom::TRandom(const std::string name)
-: Module<NoPar>(name)
+template <class Rep>
+TFundtoHirep<Rep>::TFundtoHirep(const std::string name)
+: Module<FundtoHirepPar>(name)
 {}
 
 // dependencies/products ///////////////////////////////////////////////////////
-std::vector<std::string> TRandom::getInput(void)
+template <class Rep>
+std::vector<std::string> TFundtoHirep<Rep>::getInput(void)
 {
-    std::vector<std::string> in;
-    
+    std::vector<std::string> in = {par().gaugeconf};
+
     return in;
 }
 
-std::vector<std::string> TRandom::getOutput(void)
+template <class Rep>
+std::vector<std::string> TFundtoHirep<Rep>::getOutput(void)
 {
     std::vector<std::string> out = {getName()};
-    
+
     return out;
 }
 
 // setup ///////////////////////////////////////////////////////////////////////
-void TRandom::setup(void)
+template <typename Rep>
+void TFundtoHirep<Rep>::setup(void)
 {
-    envCreateLat(LatticeGaugeField, getName());
+    envCreateLat(Rep::LatticeField, getName());
 }
 
 // execution ///////////////////////////////////////////////////////////////////
-void TRandom::execute(void)
+template <class Rep>
+void TFundtoHirep<Rep>::execute(void)
 {
-    LOG(Message) << "Generating random gauge configuration" << std::endl;
-    
-    auto &U = envGet(LatticeGaugeField, getName());
-    SU3::HotConfiguration(*env().get4dRng(), U);
+    LOG(Message) << "Transforming Representation" << std::endl;
+
+    auto &U    = envGet(LatticeGaugeField, par().gaugeconf);
+    auto &URep = envGet(Rep::LatticeField, getName());
+
+    Rep TargetRepresentation(U._grid);
+    TargetRepresentation.update_representation(U);
+    URep = TargetRepresentation.U;
 }
diff --git a/extras/Hadrons/Modules/MGauge/Unit.hpp b/Hadrons/Modules/MGauge/FundtoHirep.hpp
similarity index 58%
rename from extras/Hadrons/Modules/MGauge/Unit.hpp
rename to Hadrons/Modules/MGauge/FundtoHirep.hpp
index c1650cc7..893383d2 100644
--- a/extras/Hadrons/Modules/MGauge/Unit.hpp
+++ b/Hadrons/Modules/MGauge/FundtoHirep.hpp
@@ -2,12 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MGauge/Unit.hpp
+Source file: Hadrons/Modules/MGauge/FundtoHirep.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: pretidav <david.preti@csic.es>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -27,41 +27,50 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef Hadrons_MGauge_Unit_hpp_
-#define Hadrons_MGauge_Unit_hpp_
+#ifndef Hadrons_MGauge_FundtoHirep_hpp_
+#define Hadrons_MGauge_FundtoHirep_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
 /******************************************************************************
- *                              Unit gauge                                    *
+ *          Convert fundamental gauge field to higher representation          *
  ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)
 
-class TUnit: public Module<NoPar>
+class FundtoHirepPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FundtoHirepPar,
+                                    std::string, gaugeconf);
+};
+
+template <class Rep>
+class TFundtoHirep: public Module<FundtoHirepPar>
 {
 public:
     // constructor
-    TUnit(const std::string name);
+    TFundtoHirep(const std::string name);
     // destructor
-    virtual ~TUnit(void) = default;
-    // dependencies/products
+    virtual ~TFundtoHirep(void) {};
+    // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
-protected:
     // setup
-    virtual void setup(void);
+    void setup(void);
     // execution
-    virtual void execute(void);
+    void execute(void);
 };
 
-MODULE_REGISTER_NS(Unit, TUnit, MGauge);
+//MODULE_REGISTER_TMP(FundtoAdjoint,   TFundtoHirep<AdjointRepresentation>, MGauge);
+//MODULE_REGISTER_TMP(FundtoTwoIndexSym, TFundtoHirep<TwoIndexSymmetricRepresentation>, MGauge);
+//MODULE_REGISTER_TMP(FundtoTwoIndexAsym, TFundtoHirep<TwoIndexAntiSymmetricRepresentation>, MGauge);
 
 END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MGauge_Unit_hpp_
+#endif // Hadrons_MGauge_FundtoHirep_hpp_
diff --git a/Hadrons/Modules/MGauge/GaugeFix.cc b/Hadrons/Modules/MGauge/GaugeFix.cc
new file mode 100644
index 00000000..53aa16da
--- /dev/null
+++ b/Hadrons/Modules/MGauge/GaugeFix.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/GaugeFix.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MGauge;
+
+template class Grid::Hadrons::MGauge::TGaugeFix<GIMPL>;
diff --git a/Hadrons/Modules/MGauge/GaugeFix.hpp b/Hadrons/Modules/MGauge/GaugeFix.hpp
new file mode 100644
index 00000000..ece8c19d
--- /dev/null
+++ b/Hadrons/Modules/MGauge/GaugeFix.hpp
@@ -0,0 +1,135 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/GaugeFix.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MGaugeFix_hpp_
+#define Hadrons_MGaugeFix_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Grid/qcd/utils/GaugeFix.h>
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                              Fix gauge                                    *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MGauge)
+
+class GaugeFixPar: Serializable // serialisable parameter set of the GaugeFix module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(GaugeFixPar,
+                                    std::string, gauge, // name of the gauge field to fix
+                                    Real,  alpha, // forwarded to SteepestDescentGaugeFix; presumably the step size (confirm)
+                                    int, maxiter, // forwarded to SteepestDescentGaugeFix; presumably the iteration cap (confirm)
+                                    Real, Omega_tol, // tolerance forwarded to SteepestDescentGaugeFix
+                                    Real, Phi_tol, // tolerance forwarded to SteepestDescentGaugeFix
+                                    bool, Fourier); // forwarded to SteepestDescentGaugeFix; presumably toggles Fourier acceleration (confirm)
+};
+
+template <typename GImpl> // GImpl: gauge implementation handed to GAUGE_TYPE_ALIASES
+class TGaugeFix: public Module<GaugeFixPar>
+{
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
+public:
+    // constructor: forwards the instance name to the Module base
+    TGaugeFix(const std::string name);
+    // destructor
+    virtual ~TGaugeFix(void) {};
+    // dependencies (par().gauge) / products (getName())
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the output gauge field (NOTE(review): public here, protected in TElectrify/TRandom)
+    virtual void setup(void);
+    // execution: runs SteepestDescentGaugeFix and stores the fixed field under getName()
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(GaugeFix, TGaugeFix<GIMPL>, MGauge);
+
+/******************************************************************************
+*                            TGaugeFix implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename GImpl>
+TGaugeFix<GImpl>::TGaugeFix(const std::string name)
+: Module<GaugeFixPar>(name) // only forwards the instance name to the Module base
+{} // no further state to initialise
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TGaugeFix<GImpl>::getInput(void)
+{
+    // sole dependency: the gauge field to be fixed
+    return std::vector<std::string>{par().gauge};
+}
+
+template <typename GImpl>
+std::vector<std::string> TGaugeFix<GImpl>::getOutput(void)
+{
+    // sole product: the field registered under this module's name
+    return std::vector<std::string>{getName()};
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TGaugeFix<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName()); // output field, named after the module; written by execute()
+}
+
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TGaugeFix<GImpl>::execute(void)
+// Copies the input field par().gauge into the output getName() and gauge-fixes the copy.
+{
+    LOG(Message) << "Fixing the Gauge" << std::endl;
+    LOG(Message) << par().gauge << std::endl;
+    auto &U     = envGet(GaugeField, par().gauge);
+    auto &Umu   = envGet(GaugeField, getName());
+    LOG(Message) << "Gauge Field fetched" << std::endl;
+    // every fixing parameter is taken from the module's GaugeFixPar
+    Real alpha     = par().alpha;
+    int  maxiter   = par().maxiter;
+    Real Omega_tol = par().Omega_tol;
+    Real Phi_tol   = par().Phi_tol;
+    bool Fourier   = par().Fourier;
+    // the fixer works in place: run it on the copy so the input object stays untouched
+    Umu = U;
+    FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,maxiter,Omega_tol,Phi_tol,Fourier);
+    LOG(Message) << "Gauge Fixed" << std::endl;
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MGaugeFix_hpp_
diff --git a/Hadrons/Modules/MGauge/Random.cc b/Hadrons/Modules/MGauge/Random.cc
new file mode 100644
index 00000000..040cd91a
--- /dev/null
+++ b/Hadrons/Modules/MGauge/Random.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/Random.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MGauge/Random.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MGauge;
+
+template class Grid::Hadrons::MGauge::TRandom<GIMPL>;
diff --git a/Hadrons/Modules/MGauge/Random.hpp b/Hadrons/Modules/MGauge/Random.hpp
new file mode 100644
index 00000000..84b8dba0
--- /dev/null
+++ b/Hadrons/Modules/MGauge/Random.hpp
@@ -0,0 +1,112 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/Random.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MGauge_Random_hpp_
+#define Hadrons_MGauge_Random_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                             Random gauge                                   *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MGauge)
+
+template <typename GImpl> // GImpl: gauge implementation; also supplies HotConfiguration
+class TRandom: public Module<NoPar>
+{
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
+public:
+    // constructor: forwards the instance name to the Module base
+    TRandom(const std::string name);
+    // destructor
+    virtual ~TRandom(void) {};
+    // dependency relation: no inputs, one output named getName()
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup: creates the output gauge field
+    virtual void setup(void);
+    // execution: fills the output field through GImpl::HotConfiguration
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(Random, TRandom<GIMPL>, MGauge);
+
+/******************************************************************************
+*                           TRandom implementation                            *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename GImpl>
+TRandom<GImpl>::TRandom(const std::string name)
+: Module<NoPar>(name) // only forwards the instance name to the Module base
+{} // no further state to initialise
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TRandom<GImpl>::getInput(void)
+{
+    // no input objects are declared for this module,
+    // so the dependency list is empty
+    return std::vector<std::string>();
+}
+
+template <typename GImpl>
+std::vector<std::string> TRandom<GImpl>::getOutput(void)
+{
+    // single product, registered under the module's own name
+    std::vector<std::string> products{getName()};
+    return products;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TRandom<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName()); // output field, named after the module; filled by execute()
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TRandom<GImpl>::execute(void)
+{
+    LOG(Message) << "Generating random gauge configuration" << std::endl;
+    
+    auto &U = envGet(GaugeField, getName()); // output field created in setup()
+    GImpl::HotConfiguration(rng4d(), U); // fills U using the generator returned by rng4d()
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MGauge_Random_hpp_
diff --git a/extras/Hadrons/Modules/MGauge/StochEm.cc b/Hadrons/Modules/MGauge/StochEm.cc
similarity index 75%
rename from extras/Hadrons/Modules/MGauge/StochEm.cc
rename to Hadrons/Modules/MGauge/StochEm.cc
index c5318573..6f8bf55e 100644
--- a/extras/Hadrons/Modules/MGauge/StochEm.cc
+++ b/Hadrons/Modules/MGauge/StochEm.cc
@@ -2,11 +2,13 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MGauge/StochEm.cc
+Source file: Hadrons/Modules/MGauge/StochEm.cc
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
+Author: Vera Guelpers <vmg1n14@soton.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -25,7 +27,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
+#include <Hadrons/Modules/MGauge/StochEm.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -57,28 +59,27 @@ std::vector<std::string> TStochEm::getOutput(void)
 // setup ///////////////////////////////////////////////////////////////////////
 void TStochEm::setup(void)
 {
-    if (!env().hasCreatedObject("_" + getName() + "_weight"))
-    {
-        envCacheLat(EmComp, "_" + getName() + "_weight");
-    }
+    weightDone_ = env().hasCreatedObject("_" + getName() + "_weight");
+    envCacheLat(EmComp, "_" + getName() + "_weight");
     envCreateLat(EmField, getName());
 }
 
 // execution ///////////////////////////////////////////////////////////////////
 void TStochEm::execute(void)
 {
-    LOG(Message) << "Generating stochatic EM potential..." << std::endl;
+    LOG(Message) << "Generating stochastic EM potential..." << std::endl;
 
-    PhotonR photon(par().gauge, par().zmScheme);
+    std::vector<Real> improvements = strToVec<Real>(par().improvement);
+    PhotonR photon(par().gauge, par().zmScheme, improvements, par().G0_qedInf);
     auto    &a = envGet(EmField, getName());
     auto    &w = envGet(EmComp, "_" + getName() + "_weight");
     
-    if (!env().hasCreatedObject("_" + getName() + "_weight"))
+    if (!weightDone_)
     {
-        LOG(Message) << "Caching stochatic EM potential weight (gauge: "
+        LOG(Message) << "Caching stochastic EM potential weight (gauge: "
                      << par().gauge << ", zero-mode scheme: "
                      << par().zmScheme << ")..." << std::endl;
         photon.StochasticWeight(w);
     }
-    photon.StochasticField(a, *env().get4dRng(), w);
+    photon.StochasticField(a, rng4d(), w);
 }
diff --git a/extras/Hadrons/Modules/MGauge/StochEm.hpp b/Hadrons/Modules/MGauge/StochEm.hpp
similarity index 77%
rename from extras/Hadrons/Modules/MGauge/StochEm.hpp
rename to Hadrons/Modules/MGauge/StochEm.hpp
index bacb5172..a3f8cc96 100644
--- a/extras/Hadrons/Modules/MGauge/StochEm.hpp
+++ b/Hadrons/Modules/MGauge/StochEm.hpp
@@ -2,11 +2,13 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp
+Source file: Hadrons/Modules/MGauge/StochEm.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
+Author: Vera Guelpers <vmg1n14@soton.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -28,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MGauge_StochEm_hpp_
 #define Hadrons_MGauge_StochEm_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -44,7 +46,9 @@ class StochEmPar: Serializable
 public:
     GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
                                     PhotonR::Gauge,    gauge,
-                                    PhotonR::ZmScheme, zmScheme);
+                                    PhotonR::ZmScheme, zmScheme,
+                                    std::string,       improvement,
+                                    Real,              G0_qedInf);
 };
 
 class TStochEm: public Module<StochEmPar>
@@ -56,7 +60,7 @@ public:
     // constructor
     TStochEm(const std::string name);
     // destructor
-    virtual ~TStochEm(void) = default;
+    virtual ~TStochEm(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -65,9 +69,11 @@ protected:
     virtual void setup(void);
     // execution
     virtual void execute(void);
+private:
+    bool    weightDone_;
 };
 
-MODULE_REGISTER_NS(StochEm, TStochEm, MGauge);
+MODULE_REGISTER(StochEm, TStochEm, MGauge);
 
 END_MODULE_NAMESPACE
 
diff --git a/Hadrons/Modules/MGauge/StoutSmearing.cc b/Hadrons/Modules/MGauge/StoutSmearing.cc
new file mode 100644
index 00000000..e7fb43d1
--- /dev/null
+++ b/Hadrons/Modules/MGauge/StoutSmearing.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/StoutSmearing.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MGauge;
+
+template class Grid::Hadrons::MGauge::TStoutSmearing<GIMPL>;
diff --git a/Hadrons/Modules/MGauge/StoutSmearing.hpp b/Hadrons/Modules/MGauge/StoutSmearing.hpp
new file mode 100644
index 00000000..973ac38d
--- /dev/null
+++ b/Hadrons/Modules/MGauge/StoutSmearing.hpp
@@ -0,0 +1,135 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/StoutSmearing.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MGauge_StoutSmearing_hpp_
+#define Hadrons_MGauge_StoutSmearing_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                            Stout smearing                                  *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MGauge)
+
+// Parameters: input gauge field name, number of smearing steps, value of rho.
+class StoutSmearingPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(StoutSmearingPar, std::string, gauge,
+                                    unsigned int, steps,
+                                    double, rho);
+};
+
+// Module running par().steps iterations of stout smearing on a gauge field.
+template <typename GImpl>
+class TStoutSmearing: public Module<StoutSmearingPar>
+{
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
+public:
+    // constructor
+    TStoutSmearing(const std::string name);
+    // destructor
+    virtual ~TStoutSmearing(void) {}
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup, then execution
+    virtual void setup(void);
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(StoutSmearing, TStoutSmearing<GIMPL>, MGauge);
+
+/******************************************************************************
+ *                     TStoutSmearing implementation                          *
+ ******************************************************************************/
+// constructor: forwards the module name to the Module base class /////////////
+template <typename GImpl>
+TStoutSmearing<GImpl>::TStoutSmearing(const std::string name)
+: Module<StoutSmearingPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// Input: just the gauge field whose name is given by the 'gauge' parameter.
+template <typename GImpl>
+std::vector<std::string> TStoutSmearing<GImpl>::getInput(void)
+{
+    // single dependency, returned without a named temporary
+    return std::vector<std::string>(1, par().gauge);
+}
+
+// Output: one object, listed under this module's own name.
+template <typename GImpl>
+std::vector<std::string> TStoutSmearing<GImpl>::getOutput(void)
+{
+    // the module name doubles as the name of the product
+    return std::vector<std::string>(1, getName());
+}
+
+// setup: requests the output field and the temporary "buf" used in execute ///
+template <typename GImpl>
+void TStoutSmearing<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName());
+    envTmpLat(GaugeField, "buf");
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Apply par().steps iterations of stout smearing to the input gauge field.
+template <typename GImpl>
+void TStoutSmearing<GImpl>::execute(void)
+{
+    LOG(Message) << "Smearing '" << par().gauge << "' with " << par().steps
+                 << " step" << ((par().steps != 1) ? "s" : "")
+                 << " of stout smearing and rho= " << par().rho << std::endl;
+    Smear_Stout<GImpl> smearer(par().rho);
+    auto               &U    = envGet(GaugeField, par().gauge);
+    auto               &Usmr = envGet(GaugeField, getName());
+    envGetTmp(GaugeField, buf);
+    // start from the input so that steps == 0 still yields a defined output
+    Usmr = U;
+    LOG(Message) << "plaquette= " << WilsonLoops<GImpl>::avgPlaquette(U)
+                 << std::endl;
+    for (unsigned int n = 0; n < par().steps; ++n)
+    {
+        buf = Usmr; // buf holds the previous iterate, Usmr receives the new one
+        smearer.smear(Usmr, buf);
+        LOG(Message) << "plaquette= " << WilsonLoops<GImpl>::avgPlaquette(Usmr)
+                     << std::endl;
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MGauge_StoutSmearing_hpp_
diff --git a/Hadrons/Modules/MGauge/Unit.cc b/Hadrons/Modules/MGauge/Unit.cc
new file mode 100644
index 00000000..02ad7c0b
--- /dev/null
+++ b/Hadrons/Modules/MGauge/Unit.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/Unit.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MGauge/Unit.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MGauge;
+
+template class Grid::Hadrons::MGauge::TUnit<GIMPL>;
diff --git a/Hadrons/Modules/MGauge/Unit.hpp b/Hadrons/Modules/MGauge/Unit.hpp
new file mode 100644
index 00000000..f123f25b
--- /dev/null
+++ b/Hadrons/Modules/MGauge/Unit.hpp
@@ -0,0 +1,110 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MGauge/Unit.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MGauge_Unit_hpp_
+#define Hadrons_MGauge_Unit_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                              Unit gauge                                    *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MGauge)
+
+// Parameter-free module filling a gauge field with GImpl::ColdConfiguration.
+template <typename GImpl>
+class TUnit: public Module<NoPar>
+{
+public:
+    GAUGE_TYPE_ALIASES(GImpl,);
+public:
+    // constructor
+    TUnit(const std::string name);
+    // destructor
+    virtual ~TUnit(void) {}
+    // dependencies/products
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup, then execution
+    virtual void setup(void);
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(Unit, TUnit<GIMPL>, MGauge);
+
+/******************************************************************************
+*                            TUnit implementation                             *
+******************************************************************************/
+// constructor: forwards the module name to the Module<NoPar> base class //////
+template <typename GImpl>
+TUnit<GImpl>::TUnit(const std::string name)
+: Module<NoPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename GImpl>
+std::vector<std::string> TUnit<GImpl>::getInput(void)
+{
+    return {}; // empty list: this module declares no input
+}
+
+// Output: one object, listed under this module's own name.
+template <typename GImpl>
+std::vector<std::string> TUnit<GImpl>::getOutput(void)
+{
+    // the module name doubles as the name of the product
+    return std::vector<std::string>(1, getName());
+}
+
+// setup: requests the output gauge field, named after the module /////////////
+template <typename GImpl>
+void TUnit<GImpl>::setup(void)
+{
+    envCreateLat(GaugeField, getName());
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename GImpl>
+void TUnit<GImpl>::execute(void)
+{
+    LOG(Message) << "Creating unit gauge configuration" << std::endl;
+    // fetch the field stored under the module name and fill it in place
+    auto &unitGauge = envGet(GaugeField, getName());
+    GImpl::ColdConfiguration(rng4d(), unitGauge);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MGauge_Unit_hpp_
diff --git a/extras/Hadrons/Modules/MGauge/Unit.cc b/Hadrons/Modules/MGauge/UnitEm.cc
similarity index 72%
rename from extras/Hadrons/Modules/MGauge/Unit.cc
rename to Hadrons/Modules/MGauge/UnitEm.cc
index 8bee1ecc..d2ecad5e 100644
--- a/extras/Hadrons/Modules/MGauge/Unit.cc
+++ b/Hadrons/Modules/MGauge/UnitEm.cc
@@ -2,12 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MGauge/Unit.cc
+Source file: Hadrons/Modules/MGauge/UnitEm.cc
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -26,28 +26,27 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-
-#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
+#include <Hadrons/Modules/MGauge/UnitEm.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
 using namespace MGauge;
 
 /******************************************************************************
-*                            TUnit implementation                             *
+*                           TUnitEm implementation                            *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
-TUnit::TUnit(const std::string name)
+TUnitEm::TUnitEm(const std::string name)
 : Module<NoPar>(name)
 {}
 
 // dependencies/products ///////////////////////////////////////////////////////
-std::vector<std::string> TUnit::getInput(void)
+std::vector<std::string> TUnitEm::getInput(void)
 {
     return std::vector<std::string>();
 }
 
-std::vector<std::string> TUnit::getOutput(void)
+std::vector<std::string> TUnitEm::getOutput(void)
 {
     std::vector<std::string> out = {getName()};
     
@@ -55,16 +54,16 @@ std::vector<std::string> TUnit::getOutput(void)
 }
 
 // setup ///////////////////////////////////////////////////////////////////////
-void TUnit::setup(void)
+void TUnitEm::setup(void)
 {
-    envCreateLat(LatticeGaugeField, getName());
+    envCreateLat(EmField, getName());
 }
 
 // execution ///////////////////////////////////////////////////////////////////
-void TUnit::execute(void)
+void TUnitEm::execute(void)
 {
-    LOG(Message) << "Creating unit gauge configuration" << std::endl;
-    
-    auto &U = envGet(LatticeGaugeField, getName());
-    SU3::ColdConfiguration(*env().get4dRng(), U);
+    PhotonR photon(0, 0); // Just chose arbitrary input values here
+    auto    &a = envGet(EmField, getName());
+    LOG(Message) << "Generating unit EM potential..." << std::endl;
+    photon.UnitField(a);
 }
diff --git a/extras/Hadrons/Modules/MGauge/Random.hpp b/Hadrons/Modules/MGauge/UnitEm.hpp
similarity index 71%
rename from extras/Hadrons/Modules/MGauge/Random.hpp
rename to Hadrons/Modules/MGauge/UnitEm.hpp
index a07130e4..725147b1 100644
--- a/extras/Hadrons/Modules/MGauge/Random.hpp
+++ b/Hadrons/Modules/MGauge/UnitEm.hpp
@@ -2,12 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MGauge/Random.hpp
+Source file: Hadrons/Modules/MGauge/UnitEm.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -26,28 +26,30 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
+#ifndef Hadrons_MGauge_UnitEm_hpp_
+#define Hadrons_MGauge_UnitEm_hpp_
 
-#ifndef Hadrons_MGauge_Random_hpp_
-#define Hadrons_MGauge_Random_hpp_
-
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
 /******************************************************************************
- *                             Random gauge                                   *
+ *                             Unit EM potential                              *
  ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MGauge)
 
-class TRandom: public Module<NoPar>
+class TUnitEm: public Module<NoPar>
 {
+public:
+    typedef PhotonR::GaugeField     EmField;
+    typedef PhotonR::GaugeLinkField EmComp;
 public:
     // constructor
-    TRandom(const std::string name);
+    TUnitEm(const std::string name);
     // destructor
-    virtual ~TRandom(void) = default;
+    virtual ~TUnitEm(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -58,10 +60,10 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(Random, TRandom, MGauge);
+MODULE_REGISTER(UnitEm, TUnitEm, MGauge);
 
 END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MGauge_Random_hpp_
+#endif // Hadrons_MGauge_UnitEm_hpp_
diff --git a/Hadrons/Modules/MIO/LoadA2AVectors.cc b/Hadrons/Modules/MIO/LoadA2AVectors.cc
new file mode 100644
index 00000000..7a40a6f5
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadA2AVectors.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadA2AVectors.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MIO;
+
+template class Grid::Hadrons::MIO::TLoadA2AVectors<FIMPL>;
diff --git a/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp b/Hadrons/Modules/MIO/LoadA2AVectors.hpp
similarity index 53%
rename from extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
rename to Hadrons/Modules/MIO/LoadA2AVectors.hpp
index bb4f3f62..5b194c16 100644
--- a/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
+++ b/Hadrons/Modules/MIO/LoadA2AVectors.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
+Source file: Hadrons/Modules/MIO/LoadA2AVectors.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -26,80 +25,71 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
+#ifndef Hadrons_MIO_LoadA2AVectors_hpp_
+#define Hadrons_MIO_LoadA2AVectors_hpp_
 
-#ifndef Hadrons_MSolver_RBPrecCG_hpp_
-#define Hadrons_MSolver_RBPrecCG_hpp_
-
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/A2AVectors.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
 /******************************************************************************
- *                     Schur red-black preconditioned CG                      *
+ *                    Module to load all-to-all vectors                       *
  ******************************************************************************/
-BEGIN_MODULE_NAMESPACE(MSolver)
+BEGIN_MODULE_NAMESPACE(MIO)
 
-class RBPrecCGPar: Serializable
+class LoadA2AVectorsPar: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(RBPrecCGPar,
-                                    std::string, action,
-                                    double     , residual);
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadA2AVectorsPar,
+                                    std::string,  filestem,
+                                    bool,         multiFile,
+                                    unsigned int, size);
 };
 
 template <typename FImpl>
-class TRBPrecCG: public Module<RBPrecCGPar>
+class TLoadA2AVectors: public Module<LoadA2AVectorsPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
     // constructor
-    TRBPrecCG(const std::string name);
+    TLoadA2AVectors(const std::string name);
     // destructor
-    virtual ~TRBPrecCG(void) = default;
-    // dependencies/products
+    virtual ~TLoadA2AVectors(void) {};
+    // dependency relation
     virtual std::vector<std::string> getInput(void);
-    virtual std::vector<std::string> getReference(void);
     virtual std::vector<std::string> getOutput(void);
-protected:
     // setup
     virtual void setup(void);
     // execution
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(RBPrecCG, TRBPrecCG<FIMPL>, MSolver);
+MODULE_REGISTER_TMP(LoadA2AVectors, TLoadA2AVectors<FIMPL>, MIO);
 
 /******************************************************************************
- *                      TRBPrecCG template implementation                     *
+ *                      TLoadA2AVectors implementation                        *
  ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
 template <typename FImpl>
-TRBPrecCG<FImpl>::TRBPrecCG(const std::string name)
-: Module(name)
+TLoadA2AVectors<FImpl>::TLoadA2AVectors(const std::string name)
+: Module<LoadA2AVectorsPar>(name)
 {}
 
 // dependencies/products ///////////////////////////////////////////////////////
 template <typename FImpl>
-std::vector<std::string> TRBPrecCG<FImpl>::getInput(void)
+std::vector<std::string> TLoadA2AVectors<FImpl>::getInput(void)
 {
-    std::vector<std::string> in = {};
+    std::vector<std::string> in;
     
     return in;
 }
 
 template <typename FImpl>
-std::vector<std::string> TRBPrecCG<FImpl>::getReference(void)
-{
-    std::vector<std::string> ref = {par().action};
-    
-    return ref;
-}
-
-template <typename FImpl>
-std::vector<std::string> TRBPrecCG<FImpl>::getOutput(void)
+std::vector<std::string> TLoadA2AVectors<FImpl>::getOutput(void)
 {
     std::vector<std::string> out = {getName()};
     
@@ -108,31 +98,23 @@ std::vector<std::string> TRBPrecCG<FImpl>::getOutput(void)
 
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TRBPrecCG<FImpl>::setup(void)
+void TLoadA2AVectors<FImpl>::setup(void)
 {
-    LOG(Message) << "setting up Schur red-black preconditioned CG for"
-                 << " action '" << par().action << "' with residual "
-                 << par().residual << std::endl;
-
-    auto Ls     = env().getObjectLs(par().action);
-    auto &mat   = envGet(FMat, par().action);
-    auto solver = [&mat, this](FermionField &sol, const FermionField &source)
-    {
-        ConjugateGradient<FermionField>           cg(par().residual, 10000);
-        SchurRedBlackDiagMooeeSolve<FermionField> schurSolver(cg);
-        
-        schurSolver(mat, source, sol);
-    };
-    envCreate(SolverFn, getName(), Ls, solver);
+    envCreate(std::vector<FermionField>, getName(), 1, par().size, 
+              envGetGrid(FermionField));
 }
 
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl>
-void TRBPrecCG<FImpl>::execute(void)
-{}
+void TLoadA2AVectors<FImpl>::execute(void)
+{
+    auto &vec = envGet(std::vector<FermionField>, getName());
+
+    A2AVectorsIo::read(vec, par().filestem, par().multiFile, vm().getTrajectory());
+}
 
 END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MSolver_RBPrecCG_hpp_
+#endif // Hadrons_MIO_LoadA2AVectors_hpp_
diff --git a/Hadrons/Modules/MIO/LoadBinary.cc b/Hadrons/Modules/MIO/LoadBinary.cc
new file mode 100644
index 00000000..3f1f4fba
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadBinary.cc
@@ -0,0 +1,40 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadBinary.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MIO/LoadBinary.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MIO;
+
+template class Grid::Hadrons::MIO::TLoadBinary<GIMPL>;
+template class Grid::Hadrons::MIO::TLoadBinary<ScalarNxNAdjImplR<2>>;
+template class Grid::Hadrons::MIO::TLoadBinary<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MIO::TLoadBinary<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MIO::TLoadBinary<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MIO::TLoadBinary<ScalarNxNAdjImplR<6>>;
+
diff --git a/Hadrons/Modules/MIO/LoadBinary.hpp b/Hadrons/Modules/MIO/LoadBinary.hpp
new file mode 100644
index 00000000..ec5539b4
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadBinary.hpp
@@ -0,0 +1,140 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadBinary.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MIO_LoadBinary_hpp_
+#define Hadrons_MIO_LoadBinary_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                       Load a binary configuration                          *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MIO)
+
+// Parameters: file stem (".<trajectory>" is appended) and binary format name.
+class LoadBinaryPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadBinaryPar, std::string, file,
+                                    std::string, format);
+};
+
+// Module reading a lattice field of type Impl::Field from a binary file.
+template <typename Impl>
+class TLoadBinary: public Module<LoadBinaryPar>
+{
+public:
+    using Field       = typename Impl::Field;
+    using Simd        = typename Impl::Simd;
+    using vobj        = typename Field::vector_object;
+    using sobj        = typename vobj::scalar_object;
+    using sobj_double = typename sobj::DoublePrecision;
+    using Munger      = BinarySimpleMunger<sobj_double, sobj>;
+    // constructor
+    TLoadBinary(const std::string name);
+    // destructor
+    virtual ~TLoadBinary(void) {}
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(LoadBinary, TLoadBinary<GIMPL>, MIO);
+MODULE_REGISTER_TMP(LoadBinaryScalarSU2, TLoadBinary<ScalarNxNAdjImplR<2>>, MIO);
+MODULE_REGISTER_TMP(LoadBinaryScalarSU3, TLoadBinary<ScalarNxNAdjImplR<3>>, MIO);
+MODULE_REGISTER_TMP(LoadBinaryScalarSU4, TLoadBinary<ScalarNxNAdjImplR<4>>, MIO);
+MODULE_REGISTER_TMP(LoadBinaryScalarSU5, TLoadBinary<ScalarNxNAdjImplR<5>>, MIO);
+MODULE_REGISTER_TMP(LoadBinaryScalarSU6, TLoadBinary<ScalarNxNAdjImplR<6>>, MIO);
+
+/******************************************************************************
+ *                         TLoadBinary implementation                         *
+ ******************************************************************************/
+// constructor: forwards the module name to the Module base class /////////////
+template <typename Impl>
+TLoadBinary<Impl>::TLoadBinary(const std::string name)
+: Module<LoadBinaryPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// Input: none, this module declares no dependency on other objects.
+template <typename Impl>
+std::vector<std::string> TLoadBinary<Impl>::getInput(void)
+{
+    // empty dependency list
+    return std::vector<std::string>();
+}
+
+// Output: one object, listed under this module's own name.
+template <typename Impl>
+std::vector<std::string> TLoadBinary<Impl>::getOutput(void)
+{
+    // the module name doubles as the name of the product
+    return std::vector<std::string>(1, getName());
+}
+
+// setup: requests the output lattice of type Field, named after the module ///
+template <typename Impl>
+void TLoadBinary<Impl>::setup(void)
+{
+    envCreateLat(Field, getName());
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename Impl>
+void TLoadBinary<Impl>::execute(void)
+{
+    Munger   munger;
+    uint32_t csumNersc, csumScidacA, csumScidacB; // passed to the reader, logged
+    auto     &field = envGet(Field, getName());
+    // the file read is "<file>.<trajectory number>"
+    std::string path = par().file;
+    path += "." + std::to_string(vm().getTrajectory());
+    LOG(Message) << "Loading " << par().format
+                 << " binary configuration from file '" << path
+                 << "'" << std::endl;
+    BinaryIO::readLatticeObject<vobj, sobj_double>(field, path, munger, 0,
+                                                   par().format, csumNersc,
+                                                   csumScidacA, csumScidacB);
+    LOG(Message) << "Checksums:" << std::endl;
+    LOG(Message) << "  NERSC    " << csumNersc << std::endl;
+    LOG(Message) << "  SciDAC A " << csumScidacA << std::endl;
+    LOG(Message) << "  SciDAC B " << csumScidacB << std::endl;
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MIO_LoadBinary_hpp_
diff --git a/Hadrons/Modules/MIO/LoadCoarseEigenPack.cc b/Hadrons/Modules/MIO/LoadCoarseEigenPack.cc
new file mode 100644
index 00000000..ac8f2c4a
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadCoarseEigenPack.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadCoarseEigenPack.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MIO;
+// explicit instantiation of the module template for the coarse fermion pack type
+template class Grid::Hadrons::MIO::TLoadCoarseEigenPack<CoarseFermionEigenPack<FIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS>>;
+
diff --git a/Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp b/Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp
new file mode 100644
index 00000000..94c1ff40
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp
@@ -0,0 +1,135 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MIO_LoadCoarseEigenPack_hpp_
+#define Hadrons_MIO_LoadCoarseEigenPack_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/EigenPack.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *              Load local coherence eigen vectors/values package             *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MIO)
+
+class LoadCoarseEigenPackPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadCoarseEigenPackPar,
+                                    std::string, filestem,        // passed to epack.read()
+                                    bool,         multiFile,      // passed to epack.read()
+                                    unsigned int, sizeFine,       // passed to the pack constructor
+                                    unsigned int, sizeCoarse,     // passed to the pack constructor
+                                    unsigned int, Ls,             // selects the fine and coarse grids
+                                    std::vector<int>, blockSize); // selects the coarse grid
+};
+
+template <typename Pack>
+class TLoadCoarseEigenPack: public Module<LoadCoarseEigenPackPar>
+{
+public:
+    // BasePack is the base type handed to envCreateDerived/envGetDerived
+    using BasePack = CoarseEigenPack<typename Pack::Field, typename Pack::CoarseField>;
+    template <typename vtype>
+    using iImplScalar = iScalar<iScalar<iScalar<vtype>>>;
+    using SiteComplex = iImplScalar<typename Pack::Field::vector_type>;
+    // constructor
+    TLoadCoarseEigenPack(const std::string name);
+    // destructor
+    virtual ~TLoadCoarseEigenPack(void) {}
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+// module registration; ARG() presumably keeps the comma-containing type as one macro argument
+MODULE_REGISTER_TMP(LoadCoarseFermionEigenPack, ARG(TLoadCoarseEigenPack<CoarseFermionEigenPack<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>>), MIO);
+
+/******************************************************************************
+ *                 TLoadCoarseEigenPack implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename Pack>
+TLoadCoarseEigenPack<Pack>::TLoadCoarseEigenPack(const std::string name)
+: Module<LoadCoarseEigenPackPar>(name) // the name is simply forwarded to the Module base
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename Pack>
+std::vector<std::string> TLoadCoarseEigenPack<Pack>::getInput(void)
+{
+    // No input is declared: the list of required object names is always
+    // empty.
+    return std::vector<std::string>();
+}
+
+template <typename Pack>
+std::vector<std::string> TLoadCoarseEigenPack<Pack>::getOutput(void)
+{
+    // Exactly one product is declared: the object carrying this module's
+    // own name.
+    return std::vector<std::string>(1, getName());
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename Pack>
+void TLoadCoarseEigenPack<Pack>::setup(void)
+{
+    const auto &p = par(); // module parameters, read once
+    env().createGrid(p.Ls);
+    env().createCoarseGrid(p.blockSize, p.Ls);
+    envCreateDerived(BasePack, Pack, getName(), p.Ls, p.sizeFine, p.sizeCoarse,
+                     env().getRbGrid(p.Ls), env().getCoarseGrid(p.blockSize, p.Ls));
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename Pack>
+void TLoadCoarseEigenPack<Pack>::execute(void)
+{
+    auto                 coarseGrid = env().getCoarseGrid(par().blockSize, par().Ls);
+    auto                 &pack      = envGetDerived(BasePack, Pack, getName());
+    Lattice<SiteComplex> scratch(coarseGrid);
+    // read the pack from disk, then block-orthogonalise its vectors twice
+    pack.read(par().filestem, par().multiFile, vm().getTrajectory());
+    LOG(Message) << "Block Gramm-Schmidt pass 1"<< std::endl;
+    blockOrthogonalise(scratch, pack.evec);
+    LOG(Message) << "Block Gramm-Schmidt pass 2"<< std::endl;
+    blockOrthogonalise(scratch, pack.evec);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MIO_LoadCoarseEigenPack_hpp_
diff --git a/Hadrons/Modules/MIO/LoadCosmHol.cc b/Hadrons/Modules/MIO/LoadCosmHol.cc
new file mode 100644
index 00000000..9214165f
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadCosmHol.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadCosmHol.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MIO;
+// explicit instantiations, one per ScalarNxNAdjImplR<N> with N = 2..6
+template class Grid::Hadrons::MIO::TLoadCosmHol<ScalarNxNAdjImplR<2>>;
+template class Grid::Hadrons::MIO::TLoadCosmHol<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MIO::TLoadCosmHol<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MIO::TLoadCosmHol<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MIO::TLoadCosmHol<ScalarNxNAdjImplR<6>>;
diff --git a/Hadrons/Modules/MIO/LoadCosmHol.hpp b/Hadrons/Modules/MIO/LoadCosmHol.hpp
new file mode 100644
index 00000000..cd940309
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadCosmHol.hpp
@@ -0,0 +1,146 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadCosmHol.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MIO_LoadCosmHol_hpp_
+#define Hadrons_MIO_LoadCosmHol_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                    Load scalar SU(N) configurations                        *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MIO)
+
+class LoadCosmHolPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadCosmHolPar,
+                                    std::string, file); // file name stem: execute() appends ".<trajectory>"
+};
+
+class ScalarActionParameters: Serializable // record filled by readScidacFieldRecord() in TLoadCosmHol::execute()
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters,
+                                    double, mass_squared, // logged as "m^2"
+                                    double, lambda,       // logged as "lambda"
+                                    double, g);           // logged as "g"
+};
+
+template <typename SImpl>
+class TLoadCosmHol: public Module<LoadCosmHolPar>
+{
+public:
+    // type of the object created in setup() and filled in execute()
+    using Field = typename SImpl::Field;
+    // constructor
+    TLoadCosmHol(const std::string name);
+    // destructor
+    virtual ~TLoadCosmHol(void) {}
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+// module registrations: one module name per instantiated ScalarNxNAdjImplR<N>
+MODULE_REGISTER_TMP(LoadCosmHolSU2, TLoadCosmHol<ScalarNxNAdjImplR<2>>, MIO);
+MODULE_REGISTER_TMP(LoadCosmHolSU3, TLoadCosmHol<ScalarNxNAdjImplR<3>>, MIO);
+MODULE_REGISTER_TMP(LoadCosmHolSU4, TLoadCosmHol<ScalarNxNAdjImplR<4>>, MIO);
+MODULE_REGISTER_TMP(LoadCosmHolSU5, TLoadCosmHol<ScalarNxNAdjImplR<5>>, MIO);
+MODULE_REGISTER_TMP(LoadCosmHolSU6, TLoadCosmHol<ScalarNxNAdjImplR<6>>, MIO);
+
+/******************************************************************************
+ *                       TLoadCosmHol implementation                          *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TLoadCosmHol<SImpl>::TLoadCosmHol(const std::string name)
+: Module<LoadCosmHolPar>(name) // the name is simply forwarded to the Module base
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TLoadCosmHol<SImpl>::getInput(void)
+{
+    // No input is declared: the list of required object names is always
+    // empty.
+    return std::vector<std::string>();
+}
+
+template <typename SImpl>
+std::vector<std::string> TLoadCosmHol<SImpl>::getOutput(void)
+{
+    // Exactly one product is declared: the object carrying this module's
+    // own name.
+    return std::vector<std::string>(1, getName());
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TLoadCosmHol<SImpl>::setup(void)
+{
+    envCreateLat(Field, getName()); // output object: type Field, named after the module
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TLoadCosmHol<SImpl>::execute(void)
+{
+    ScalarActionParameters meta;
+    // input path is "<file>.<trajectory number>"
+    std::string            path = par().file + "." + std::to_string(vm().getTrajectory());
+    ScidacReader           reader;
+    const unsigned int     dim    = SImpl::Group::Dimension;
+    auto                   &field = envGet(Field, getName());
+
+    LOG(Message) << "Loading CosmHol configuration from file '" << path << "'"
+                 << std::endl;
+    reader.open(path);
+    reader.readScidacFieldRecord(field, meta); // field data and its metadata record
+    reader.close();
+    LOG(Message) << "tr(phi^2) = "
+                 << -TensorRemove(sum(trace(field*field))).real()/env().getVolume()
+                 << std::endl;
+    LOG(Message) << "Configuration parameters:" << std::endl;
+    LOG(Message) << "     N = " << dim << std::endl;
+    LOG(Message) << "   m^2 = " << meta.mass_squared << std::endl;
+    LOG(Message) << "lambda = " << meta.lambda << std::endl;
+    LOG(Message) << "     g = " << meta.g << std::endl;
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MIO_LoadCosmHol_hpp_
diff --git a/Hadrons/Modules/MIO/LoadEigenPack.cc b/Hadrons/Modules/MIO/LoadEigenPack.cc
new file mode 100644
index 00000000..28fdeb01
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadEigenPack.cc
@@ -0,0 +1,37 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadEigenPack.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MIO;
+// explicit instantiations; the <FIMPL, FIMPLF> one needs GRID_DEFAULT_PRECISION_DOUBLE
+template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL>>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+template class Grid::Hadrons::MIO::TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>;
+#endif
diff --git a/Hadrons/Modules/MIO/LoadEigenPack.hpp b/Hadrons/Modules/MIO/LoadEigenPack.hpp
new file mode 100644
index 00000000..016675c9
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadEigenPack.hpp
@@ -0,0 +1,133 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadEigenPack.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MIO_LoadEigenPack_hpp_
+#define Hadrons_MIO_LoadEigenPack_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/EigenPack.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                   Load eigen vectors/values package                        *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MIO)
+
+class LoadEigenPackPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadEigenPackPar,
+                                    std::string, filestem, // passed to epack.read()
+                                    bool, multiFile,       // passed to epack.read()
+                                    unsigned int, size,    // pack size; eval is resized to it
+                                    unsigned int, Ls);     // passed to envGetRbGrid()
+};
+
+template <typename Pack>
+class TLoadEigenPack: public Module<LoadEigenPackPar>
+{
+public:
+    // field types come from the pack; setup() compares Field and FieldIo
+    using Field    = typename Pack::Field;
+    using FieldIo  = typename Pack::FieldIo;
+    using BasePack = BaseEigenPack<Field>;
+    // constructor
+    TLoadEigenPack(const std::string name);
+    // destructor
+    virtual ~TLoadEigenPack(void) {}
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+// module registrations; the Io32 one needs GRID_DEFAULT_PRECISION_DOUBLE
+MODULE_REGISTER_TMP(LoadFermionEigenPack, TLoadEigenPack<FermionEigenPack<FIMPL>>, MIO);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+MODULE_REGISTER_TMP(LoadFermionEigenPackIo32, ARG(TLoadEigenPack<FermionEigenPack<FIMPL, FIMPLF>>), MIO);
+#endif
+
+/******************************************************************************
+ *                    TLoadEigenPack implementation                           *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename Pack>
+TLoadEigenPack<Pack>::TLoadEigenPack(const std::string name)
+: Module<LoadEigenPackPar>(name) // the name is simply forwarded to the Module base
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename Pack>
+std::vector<std::string> TLoadEigenPack<Pack>::getInput(void)
+{
+    // No input is declared: the list of required object names is always
+    // empty.
+    return std::vector<std::string>();
+}
+
+template <typename Pack>
+std::vector<std::string> TLoadEigenPack<Pack>::getOutput(void)
+{
+    // Exactly one product is declared: the object carrying this module's
+    // own name.
+    return std::vector<std::string>(1, getName());
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename Pack>
+void TLoadEigenPack<Pack>::setup(void)
+{
+    // a separate I/O grid is only looked up when the I/O field type differs
+    // from the in-memory field type; otherwise a null pointer is passed on
+    const bool differentIo = (typeHash<Field>() != typeHash<FieldIo>());
+    GridBase   *ioGrid     = differentIo ? envGetRbGrid(FieldIo, par().Ls)
+                                         : nullptr;
+
+    envCreateDerived(BasePack, Pack, getName(), par().Ls, par().size,
+                     envGetRbGrid(Field, par().Ls), ioGrid);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename Pack>
+void TLoadEigenPack<Pack>::execute(void)
+{
+    auto &pack = envGetDerived(BasePack, Pack, getName());
+    // after reading, the eigenvalue container is sized to the requested count
+    pack.read(par().filestem, par().multiFile, vm().getTrajectory());
+    pack.eval.resize(par().size);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MIO_LoadEigenPack_hpp_
diff --git a/Hadrons/Modules/MIO/LoadNersc.cc b/Hadrons/Modules/MIO/LoadNersc.cc
new file mode 100644
index 00000000..ce7276f9
--- /dev/null
+++ b/Hadrons/Modules/MIO/LoadNersc.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MIO/LoadNersc.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MIO/LoadNersc.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MIO;
+// explicit instantiation of the module template for GIMPL
+template class Grid::Hadrons::MIO::TLoadNersc<GIMPL>;
diff --git a/extras/Hadrons/Modules/MGauge/Load.cc b/Hadrons/Modules/MIO/LoadNersc.hpp
similarity index 51%
rename from extras/Hadrons/Modules/MGauge/Load.cc
rename to Hadrons/Modules/MIO/LoadNersc.hpp
index b168a010..c0e69511 100644
--- a/extras/Hadrons/Modules/MGauge/Load.cc
+++ b/Hadrons/Modules/MIO/LoadNersc.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MGauge/Load.cc
+Source file: Hadrons/Modules/MIO/LoadNersc.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -26,30 +25,68 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
+#ifndef Hadrons_MIO_LoadNersc_hpp_
+#define Hadrons_MIO_LoadNersc_hpp_
 
-#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
-using namespace Grid;
-using namespace Hadrons;
-using namespace MGauge;
+BEGIN_HADRONS_NAMESPACE
 
 /******************************************************************************
-*                           TLoad implementation                               *
+ *                       Load a NERSC configuration                           *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MIO)
+
+class LoadNerscPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadNerscPar,
+                                    std::string, file); // stem of the file name built in execute()
+};
+
+template <typename GImpl>
+class TLoadNersc: public Module<LoadNerscPar>
+{
+public:
+    GAUGE_TYPE_ALIASES(GImpl,); // presumably provides GaugeField, used by setup()/execute()
+public:
+    // constructor
+    TLoadNersc(const std::string name);
+    // destructor
+    virtual ~TLoadNersc(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+// module registration for the GIMPL instantiation
+MODULE_REGISTER_TMP(LoadNersc,  TLoadNersc<GIMPL>,  MIO);
+
+/******************************************************************************
+*                       TLoadNersc implementation                             *
 ******************************************************************************/
 // constructor /////////////////////////////////////////////////////////////////
-TLoad::TLoad(const std::string name)
-: Module<LoadPar>(name)
+template <typename GImpl>
+TLoadNersc<GImpl>::TLoadNersc(const std::string name)
+: Module<LoadNerscPar>(name)
 {}
 
 // dependencies/products ///////////////////////////////////////////////////////
-std::vector<std::string> TLoad::getInput(void)
+template <typename GImpl>
+std::vector<std::string> TLoadNersc<GImpl>::getInput(void)
 {
     std::vector<std::string> in;
     
     return in;
 }
 
-std::vector<std::string> TLoad::getOutput(void)
+template <typename GImpl>
+std::vector<std::string> TLoadNersc<GImpl>::getOutput(void)
 {
     std::vector<std::string> out = {getName()};
     
@@ -57,13 +94,15 @@ std::vector<std::string> TLoad::getOutput(void)
 }
 
 // setup ///////////////////////////////////////////////////////////////////////
-void TLoad::setup(void)
+template <typename GImpl>
+void TLoadNersc<GImpl>::setup(void)
 {
-    envCreateLat(LatticeGaugeField, getName());
+    envCreateLat(GaugeField, getName());
 }
 
 // execution ///////////////////////////////////////////////////////////////////
-void TLoad::execute(void)
+template <typename GImpl>
+void TLoadNersc<GImpl>::execute(void)
 {
     FieldMetaData header;
     std::string   fileName = par().file + "."
@@ -71,8 +110,12 @@ void TLoad::execute(void)
     LOG(Message) << "Loading NERSC configuration from file '" << fileName
                  << "'" << std::endl;
 
-    auto &U = envGet(LatticeGaugeField, getName());
+    auto &U = envGet(GaugeField, getName());
     NerscIO::readConfiguration(U, header, fileName);
-    LOG(Message) << "NERSC header:" << std::endl;
-    dump_meta_data(header, LOG(Message));
 }
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MIO_LoadNersc_hpp_
diff --git a/Hadrons/Modules/MLoop/NoiseLoop.cc b/Hadrons/Modules/MLoop/NoiseLoop.cc
new file mode 100644
index 00000000..a21372fb
--- /dev/null
+++ b/Hadrons/Modules/MLoop/NoiseLoop.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MLoop/NoiseLoop.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MLoop/NoiseLoop.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MLoop;
+// explicit instantiation of the module template for FIMPL
+template class Grid::Hadrons::MLoop::TNoiseLoop<FIMPL>;
+
diff --git a/extras/Hadrons/Modules/MLoop/NoiseLoop.hpp b/Hadrons/Modules/MLoop/NoiseLoop.hpp
similarity index 91%
rename from extras/Hadrons/Modules/MLoop/NoiseLoop.hpp
rename to Hadrons/Modules/MLoop/NoiseLoop.hpp
index 512c731a..8da172d4 100644
--- a/extras/Hadrons/Modules/MLoop/NoiseLoop.hpp
+++ b/Hadrons/Modules/MLoop/NoiseLoop.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MLoop/NoiseLoop.hpp
+Source file: Hadrons/Modules/MLoop/NoiseLoop.hpp
 
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MLoop_NoiseLoop_hpp_
 #define Hadrons_MLoop_NoiseLoop_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -70,7 +71,7 @@ public:
     // constructor
     TNoiseLoop(const std::string name);
     // destructor
-    virtual ~TNoiseLoop(void) = default;
+    virtual ~TNoiseLoop(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -81,7 +82,7 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(NoiseLoop, TNoiseLoop<FIMPL>, MLoop);
+MODULE_REGISTER_TMP(NoiseLoop, TNoiseLoop<FIMPL>, MLoop);
 
 /******************************************************************************
  *                 TNoiseLoop implementation                                  *
diff --git a/Hadrons/Modules/MNPR/Amputate.cc b/Hadrons/Modules/MNPR/Amputate.cc
new file mode 100644
index 00000000..ec7c5940
--- /dev/null
+++ b/Hadrons/Modules/MNPR/Amputate.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/Amputate.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MNPR/Amputate.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MNPR;
+// explicit instantiation of the module template for <FIMPL,FIMPL>
+template class Grid::Hadrons::MNPR::TAmputate<FIMPL,FIMPL>;
+
diff --git a/Hadrons/Modules/MNPR/Amputate.hpp b/Hadrons/Modules/MNPR/Amputate.hpp
new file mode 100644
index 00000000..93731bd6
--- /dev/null
+++ b/Hadrons/Modules/MNPR/Amputate.hpp
@@ -0,0 +1,200 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/Amputate.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Julia Kettle J.R.Kettle-2@sms.ed.ac.uk
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_Amputate_hpp_
+#define Hadrons_Amputate_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Grid/Eigen/LU>
+//#include <Grid/qcd/utils/PropagatorUtils.h>
+//#include <Grid/serialisation/Serialisation.h>
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                                TAmputate                                       *
+        Amputates bilinear vertices: tr[adj(G)*g5*Sout^-1*g5*V*Sin^-1]/12
+        Suitable for non-exceptional momenta
+******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MNPR)
+
+class AmputatePar: Serializable // serialisable parameter set of the Amputate module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(AmputatePar,
+                                    std::string,    Sin, // name of the propagator fetched as PropagatorField1 (TODO: make this a propagator type?)
+                                    std::string,    Sout, // name of the propagator fetched as PropagatorField2 (same TODO)
+                                    std::string,    vertex, // listed as a module dependency by getInput()
+                                    std::string,    pin, // momentum applied to Sin, parsed with strToVec<int>
+                                    std::string,    pout, // momentum applied to Sout, parsed with strToVec<int>
+                                    std::string,    output, // path handed to the BinaryWriter in execute()
+                                    std::string,    input); // path handed to the BinaryReader the vertices are read from
+};
+
+template <typename FImpl1, typename FImpl2>
+class TAmputate: public Module<AmputatePar> // module amputating vertices with inverted momentum-space propagators (see execute)
+{
+public:
+    FERM_TYPE_ALIASES(FImpl1, 1); // NOTE(review): presumably provides the suffix-1 aliases such as PropagatorField1 - confirm
+    FERM_TYPE_ALIASES(FImpl2, 2); // NOTE(review): presumably provides the suffix-2 aliases such as PropagatorField2 - confirm
+    class Result: Serializable // serialisable container for the amputated traces
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
+                                        std::vector<Complex>, Vamp,
+                                        ); 
+    };
+public:
+    // constructor: takes the module instance name
+    TAmputate(const std::string name);
+    // destructor
+    virtual ~TAmputate(void) {};
+    // dependencies/products: names of the objects read and produced by the module
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    virtual SpinColourMatrix invertspincolmat(SpinColourMatrix &scmat); // inverse of one spin-colour matrix, computed with Eigen
+    // execution
+    virtual void execute(void);
+};
+// NOTE(review): presumably registers the FIMPL/FIMPL instantiation as module "Amputate" in MNPR - confirm against the macro.
+MODULE_REGISTER_TMP(Amputate, ARG(TAmputate<FIMPL, FIMPL>), MNPR);
+
+/******************************************************************************
+ *                           TAmputate implementation                            *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+TAmputate<FImpl1, FImpl2>::TAmputate(const std::string name)
+: Module<AmputatePar>(name) // the instance name is forwarded to the Module base; nothing else is initialised
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TAmputate<FImpl1, FImpl2>::getInput(void)
+{
+    // Dependencies of this module: the two propagators and the vertex,
+    // all referred to by the names given in the parameters.
+    return {par().Sin, par().Sout, par().vertex};
+}
+
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TAmputate<FImpl1, FImpl2>::getOutput(void)
+{
+    // The only product is named after the module instance.
+    std::vector<std::string> products;
+    products.push_back(getName());
+    return products;
+}
+
+// Invert a spin-colour matrix by mapping it onto a dense (Ns*Nc)x(Ns*Nc) Eigen matrix.
+template <typename FImpl1, typename FImpl2>
+SpinColourMatrix TAmputate<FImpl1, FImpl2>::invertspincolmat(SpinColourMatrix &scmat)
+{
+    // Double-precision storage: MatrixXcf truncated a double-precision Complex
+    // to float before the inversion. Flattened row/column index is Ns*colour + spin.
+    Eigen::MatrixXcd scmat_2d(Ns*Nc,Ns*Nc);
+    for(int ic=0; ic<Nc; ic++){
+    for(int jc=0; jc<Nc; jc++){
+        for(int is=0; is<Ns; is++){
+        for(int js=0; js<Ns; js++){
+            scmat_2d(Ns*ic+is,Ns*jc+js) = scmat()(is,js)(ic,jc);
+        }}}}
+    Eigen::MatrixXcd scmat_2d_inv = scmat_2d.inverse();
+    SpinColourMatrix scmat_inv;
+    // Copy the inverse back using the same index mapping.
+    for(int ic=0; ic<Nc; ic++){
+    for(int jc=0; jc<Nc; jc++){
+        for(int is=0; is<Ns; is++){
+        for(int js=0; js<Ns; js++){
+            scmat_inv()(is,js)(ic,jc) = scmat_2d_inv(Ns*ic+is,Ns*jc+js);
+        }}}}
+    return scmat_inv;
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+void TAmputate<FImpl1, FImpl2>::execute(void)
+{
+    // Read the vertices from par().input, project the phased propagators to momentum space, invert and amputate.
+    LOG(Message) << "Computing bilinear amputations '" << getName() << "' using"
+                 << " momentum '" << par().Sin << "' and '" << par().Sout << "'"
+                 << std::endl;
+    BinaryWriter                    writer(par().output);
+    PropagatorField1                &Sin = *env().template getObject<PropagatorField1>(par().Sin);
+    PropagatorField2                &Sout = *env().template getObject<PropagatorField2>(par().Sout);
+    // Phased copies: phasing Sin/Sout in place would alter the shared environment objects.
+    PropagatorField1                Sin_ph(env().getGrid());
+    PropagatorField2                Sout_ph(env().getGrid());
+    std::vector<int>                pin  = strToVec<int>(par().pin), pout = strToVec<int>(par().pout);
+    LatticeComplex                  pdotxin(env().getGrid()), pdotxout(env().getGrid()), coor(env().getGrid());
+    LOG(Message) << "Propagators set up " << std::endl;
+    std::vector<SpinColourMatrix>   vertex; // filled from par().input below
+    Gamma                           g5(Gamma::Algebra::Gamma5);
+    Result                          result;
+    LOG(Message) << "reading file - "  << par().input << std::endl;
+    BinaryReader                    reader(par().input);
+    Complex                         Ci(0.0,1.0);
+    read(reader,"vertex", vertex);
+    LOG(Message) << "vertex read" << std::endl;
+
+    // p.x = sum_mu (2*pi/L_mu)*p_mu*x_mu; L_mu is the lattice extent (it used to be copied from pin).
+    pdotxin=Zero();
+    pdotxout=Zero();
+    for (unsigned int mu = 0; mu < 4; ++mu)
+    {
+        Real TwoPiL =  M_PI * 2.0/ env().getGrid()->_fdimensions[mu];
+        LatticeCoordinate(coor,mu);
+        pdotxin = pdotxin +(TwoPiL * pin[mu]) * coor;
+        pdotxout= pdotxout +(TwoPiL * pout[mu]) * coor;
+    }
+    Sin_ph  = Sin*exp(-Ci*pdotxin); //phase corrections
+    Sout_ph = Sout*exp(-Ci*pdotxout);
+
+    SpinColourMatrix Sin_mom = sum(Sin_ph);
+    SpinColourMatrix Sout_mom = sum(Sout_ph);
+    LOG(Message) << "summed over lattice" << std::endl;
+    LOG(Message) << "Lattice -> spincolourmatrix conversion" << std::endl;
+    SpinColourMatrix Sin_inv = invertspincolmat(Sin_mom);
+    SpinColourMatrix Sout_inv = invertspincolmat(Sout_mom);
+    LOG(Message) << "Inversions done" << std::endl;
+
+    result.Vamp.resize(Gamma::nGamma/2);
+    for( int mu=0; mu < Gamma::nGamma/2; mu++){
+        result.Vamp[mu] = 1/12.0*trace(adj(Gamma(mu*2+1))*g5*Sout_inv*g5*vertex[mu]*Sin_inv);
+        LOG(Message) << "Vamp[" << mu << "] - " << result.Vamp[mu] << std::endl;
+    }
+    write(writer, "Vamp", result.Vamp); // previously the result was computed but never written out
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Amputate_hpp_
diff --git a/Hadrons/Modules/MNPR/Bilinear.cc b/Hadrons/Modules/MNPR/Bilinear.cc
new file mode 100644
index 00000000..c5b38d37
--- /dev/null
+++ b/Hadrons/Modules/MNPR/Bilinear.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/Bilinear.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MNPR/Bilinear.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MNPR;
+// Explicit instantiation of TBilinear for the FIMPL/FIMPL implementation pair.
+template class Grid::Hadrons::MNPR::TBilinear<FIMPL,FIMPL>;
+
diff --git a/Hadrons/Modules/MNPR/Bilinear.hpp b/Hadrons/Modules/MNPR/Bilinear.hpp
new file mode 100644
index 00000000..e01837a6
--- /dev/null
+++ b/Hadrons/Modules/MNPR/Bilinear.hpp
@@ -0,0 +1,225 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/Bilinear.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Julia Kettle J.R.Kettle-2@sms.ed.ac.uk
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_Bilinear_hpp_
+#define Hadrons_Bilinear_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+//#include <Grid/qcd/utils/PropagatorUtils.h>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                                TBilinear                                       *
+        Performs bilinear contractions of the type tr[g5*adj(Sout)*g5*G*Sin]
+        Suitable for non exceptional momenta in Rome-Southampton NPR
+******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MNPR)
+
+class BilinearPar: Serializable // serialisable parameter set of the Bilinear module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(BilinearPar,
+                                    std::string,    Sin, // name of the propagator phased with pin
+                                    std::string,    Sout, // name of the propagator phased with pout
+                                    std::string,    pin, // momentum applied to Sin, parsed with strToVec<Real>
+                                    std::string,    pout, // momentum applied to Sout, parsed with strToVec<Real>
+                                    std::string,    output); // path handed to the BinaryWriter; also used as the record name in write()
+};
+
+template <typename FImpl1, typename FImpl2>
+class TBilinear: public Module<BilinearPar> // module computing the lattice-summed bilinear vertices (see execute)
+{
+public:
+    FERM_TYPE_ALIASES(FImpl1, 1); // NOTE(review): presumably provides the suffix-1 fermion type aliases - confirm
+    FERM_TYPE_ALIASES(FImpl2, 2); // NOTE(review): presumably provides the suffix-2 fermion type aliases - confirm
+    class Result: Serializable // serialisable container, one SpinColourMatrix per gamma structure
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result, 
+                                        std::vector<SpinColourMatrix>, bilinear);
+    };
+public:
+    // constructor: takes the module instance name
+    TBilinear(const std::string name);
+    // destructor
+    virtual ~TBilinear(void) {};
+    // dependencies/products: names of the objects read and produced by the module
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    //LatticeSpinColourMatrix PhaseProps(LatticeSpinColourMatrix S, std::vector<Real> p);   (disabled helper)
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+// NOTE(review): presumably registers the FIMPL/FIMPL instantiation as module "Bilinear" in MNPR - confirm against the macro.
+MODULE_REGISTER_TMP(Bilinear, ARG(TBilinear<FIMPL, FIMPL>), MNPR);
+
+/******************************************************************************
+ *                           TBilinear implementation                            *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+TBilinear<FImpl1, FImpl2>::TBilinear(const std::string name)
+: Module<BilinearPar>(name) // the instance name is forwarded to the Module base; nothing else is initialised
+{}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+void TBilinear<FImpl1, FImpl2>::setup(void)
+{
+    // No-op: nothing is created in the environment. Disabled registration calls kept for reference:
+    //   env().template registerLattice<LatticeSpinColourMatrix>(getName()); env().template registerObject<SpinColourMatrix>(getName());
+}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TBilinear<FImpl1, FImpl2>::getInput(void)
+{
+    // Dependencies of this module: the two propagators,
+    // referred to by the names given in the parameters.
+    return {par().Sin, par().Sout};
+}
+
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TBilinear<FImpl1, FImpl2>::getOutput(void)
+{
+    // Single product, named after the module instance.
+    std::vector<std::string> products(1, getName());
+    return products;
+}
+
+/*
+/////Phase propagators//////////////////////////
+template <typename FImpl1, typename FImpl2>
+LatticeSpinColourMatrix TBilinear<FImpl1, FImpl2>::PhaseProps(LatticeSpinColourMatrix S, std::vector<Real> p)
+{
+    GridBase *grid = S._grid;
+    LatticeComplex      pdotx(grid),  coor(grid);
+    std::vector<int>   latt_size = grid->_fdimensions; 
+    Complex             Ci(0.0,1.0);
+    pdotx=Zero();
+    for (unsigned int mu = 0; mu < 4; ++mu)
+    {
+        Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
+        LatticeCoordinate(coor,mu);
+        pdotx = pdotx +(TwoPiL * p[mu]) * coor;
+    }
+    S = S*exp(-Ci*pdotx);
+    return S;
+}
+*/
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+void TBilinear<FImpl1, FImpl2>::execute(void)
+{
+/**************************************************************************
+
+Compute the bilinear vertex needed for the NPR.
+V(G) = sum_x  [ g5 * adj(S'(x,p2)) * g5 * G * S'(x,p1) ]_{si,sj,ci,cj}
+G is one of the 16 gamma vertices [I,gmu,g5,g5gmu,sig(mu,nu)]
+
+        * G
+       / \
+    p1/   \p2
+     /     \
+    /       \
+
+Returns a spin-colour matrix, with indices si,sj, ci,cj
+
+Conventions:
+p1 - incoming momenta
+p2 - outgoing momenta
+q = (p1-p2)
+**************************************************************************/
+
+    LOG(Message) << "Computing bilinear contractions '" << getName() << "' using"
+                 << " momentum '" << par().Sin << "' and '" << par().Sout << "'"
+                 << std::endl;
+
+    BinaryWriter             writer(par().output);
+
+    // Propagators as stored in the environment; they are only read here.
+    LatticeSpinColourMatrix     &Sin = *env().template getObject<LatticeSpinColourMatrix>(par().Sin);
+    LatticeSpinColourMatrix     &Sout = *env().template getObject<LatticeSpinColourMatrix>(par().Sout);
+    // Phased copies S'(x,p) = S(x)*exp(-i p.x). Phasing Sin/Sout in place would
+    // leave the shared propagators modified for every later module using them.
+    LatticeSpinColourMatrix     Sin_ph(env().getGrid()), Sout_ph(env().getGrid());
+    LatticeComplex              pdotxin(env().getGrid()), pdotxout(env().getGrid()), coor(env().getGrid());
+    // momentum on legs
+    std::vector<Real>           pin  = strToVec<Real>(par().pin), pout = strToVec<Real>(par().pout);
+    //bilinears
+    LatticeSpinColourMatrix     bilinear_x(env().getGrid());
+    Gamma                       g5(Gamma::Algebra::Gamma5);
+    Result                      result;
+    Complex                     Ci(0.0,1.0);
+
+    // p.x = sum_mu (2*pi/L_mu)*p_mu*x_mu with L_mu the lattice extent in direction mu
+    // (the extents used to be copied from pin, i.e. the momentum was divided by itself).
+    pdotxin=Zero();
+    pdotxout=Zero();
+    for (unsigned int mu = 0; mu < 4; ++mu)
+    {
+        Real TwoPiL =  M_PI * 2.0/ env().getGrid()->_fdimensions[mu];
+        LatticeCoordinate(coor,mu);
+        pdotxin = pdotxin +(TwoPiL * pin[mu]) * coor;
+        pdotxout= pdotxout +(TwoPiL * pout[mu]) * coor;
+    }
+    Sin_ph  = Sin*exp(-Ci*pdotxin); //phase corrections
+    Sout_ph = Sout*exp(-Ci*pdotxout);
+
+    ////Set up gamma vector//////////////////////////
+    std::vector<Gamma> gammavector;
+    for( int i=0; i<Gamma::nGamma; i++){
+        Gamma::Algebra gam = i;
+        gammavector.push_back(Gamma(gam));
+    }
+    result.bilinear.resize(Gamma::nGamma);
+    /////////////////////////////////////////////////
+    // one lattice-summed vertex per entry of gammavector
+    ////////Form Vertex//////////////////////////////
+    for (int i=0; i < Gamma::nGamma; i++){
+        bilinear_x = g5*adj(Sout_ph)*g5*gammavector[i]*Sin_ph;
+        result.bilinear[i] = sum(bilinear_x); //sum over lattice sites
+    }
+    //////////////////////////////////////////////////
+    write(writer, par().output, result.bilinear);
+    LOG(Message) << "Complete. Writing results to " << par().output << std:: endl;
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Bilinear_hpp_
diff --git a/Hadrons/Modules/MNPR/FourQuark.cc b/Hadrons/Modules/MNPR/FourQuark.cc
new file mode 100644
index 00000000..1943c25b
--- /dev/null
+++ b/Hadrons/Modules/MNPR/FourQuark.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/FourQuark.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MNPR/FourQuark.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MNPR;
+// Explicit instantiation of TFourQuark for the FIMPL/FIMPL implementation pair.
+template class Grid::Hadrons::MNPR::TFourQuark<FIMPL,FIMPL>;
+
diff --git a/Hadrons/Modules/MNPR/FourQuark.hpp b/Hadrons/Modules/MNPR/FourQuark.hpp
new file mode 100644
index 00000000..d56eb7a9
--- /dev/null
+++ b/Hadrons/Modules/MNPR/FourQuark.hpp
@@ -0,0 +1,277 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNPR/FourQuark.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Julia Kettle J.R.Kettle-2@sms.ed.ac.uk
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_FourQuark_hpp_
+#define Hadrons_FourQuark_hpp_
+
+#include <typeinfo>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Grid/serialisation/Serialisation.h>
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                                TFourQuark                                       *
+        Builds four-quark vertices: outer product of two bilinears g5*adj(Sout)*g5*G*Sin
+        (spin and colour indices left open). Suitable for non-exceptional momenta
+******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MNPR)
+
+class FourQuarkPar: Serializable // serialisable parameter set of the FourQuark module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FourQuarkPar,
+                                    std::string,    Sin, // name of the propagator fetched as PropagatorField1 (TODO: make this a propagator type?)
+                                    std::string,    Sout, // name of the propagator fetched as PropagatorField2 (same TODO)
+                                    std::string,    pin, // momentum applied to Sin, parsed with strToVec<Real>
+                                    std::string,    pout, // momentum applied to Sout, parsed with strToVec<Real>
+                                    bool,           fullbasis, // true: all (mu,nu) gamma pairs; false: the mu == nu branch of execute()
+                                    std::string,    output); // path handed to the BinaryWriter in execute()
+};
+
+template <typename FImpl1, typename FImpl2>
+class TFourQuark: public Module<FourQuarkPar> // module computing lattice-summed four-quark vertices (see execute)
+{
+public:
+    FERM_TYPE_ALIASES(FImpl1, 1); // NOTE(review): presumably provides the suffix-1 aliases such as PropagatorField1 - confirm
+    FERM_TYPE_ALIASES(FImpl2, 2); // NOTE(review): presumably provides the suffix-2 aliases such as PropagatorField2 - confirm
+    class Result: Serializable // serialisable container, one SpinColourSpinColourMatrix per gamma combination
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
+                                        std::vector<SpinColourSpinColourMatrix>, fourquark);
+    };
+public:
+    // constructor: takes the module instance name
+    TFourQuark(const std::string name);
+    // destructor
+    virtual ~TFourQuark(void) {};
+    // dependencies/products: names of the objects read and produced by the module
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // site-local outer product helper, then setup
+    virtual void tensorprod(LatticeSpinColourSpinColourMatrix &lret, LatticeSpinColourMatrix a, LatticeSpinColourMatrix b);
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+// NOTE(review): presumably registers the FIMPL/FIMPL instantiation as module "FourQuark" in MNPR - confirm against the macro.
+MODULE_REGISTER_TMP(FourQuark, ARG(TFourQuark<FIMPL, FIMPL>), MNPR);
+
+/******************************************************************************
+ *                           TFourQuark implementation                            *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+TFourQuark<FImpl1, FImpl2>::TFourQuark(const std::string name)
+: Module<FourQuarkPar>(name) // the instance name is forwarded to the Module base; nothing else is initialised
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TFourQuark<FImpl1, FImpl2>::getInput(void)
+{
+    // Dependencies of this module: the two propagators,
+    // referred to by the names given in the parameters.
+    return {par().Sin, par().Sout};
+}
+
+template <typename FImpl1, typename FImpl2>
+std::vector<std::string> TFourQuark<FImpl1, FImpl2>::getOutput(void)
+{
+    // Single product, named after the module instance.
+    std::vector<std::string> products(1, getName());
+    return products;
+}
+
+// Site-local outer product: for every spin/colour index pair of a, lret(si,sj)(ci,cj) = a(si,sj)(ci,cj) * b.
+template <typename FImpl1, typename FImpl2>
+void TFourQuark<FImpl1, FImpl2>::tensorprod(LatticeSpinColourSpinColourMatrix &lret, LatticeSpinColourMatrix a, LatticeSpinColourMatrix b)
+{
+#if 0
+            parallel_for(auto site=lret.begin();site<lret.end();site++) {
+                for (int si; si < 4; ++si){
+                for(int sj; sj <4; ++sj){
+                    for (int ci; ci < 3; ++ci){
+                    for (int cj; cj < 3; ++cj){
+                        for (int sk; sk < 4; ++sk){
+                        for(int sl; sl <4; ++sl){
+                            for (int ck; ck < 3; ++ck){
+                            for (int cl; cl < 3; ++cl){
+                        lret[site]()(si,sj)(ci,cj)(sk,sl)(ck,cl)=a[site]()(si,sj)(ci,cj)*b[site]()(sk,sl)(ck,cl);
+                            }}
+                        }}
+                    }}
+                }}
+        }
+#else 
+            // FIXME ; is there a general need for this construct ? In which case we should encapsulate the
+            //         below loops in a helper function.
+            // The #if 0 branch above is never compiled (its loop indices are also uninitialised).
+            // NOTE(review): a and b are taken by value, which presumably copies both lattice fields - confirm.
+	    auto  lret_v = lret.View();
+	    auto  a_v = a.View();
+	    auto  b_v = b.View();
+            parallel_for(auto site=lret_v.begin();site<lret_v.end();site++) {
+            vTComplex left;
+                for(int si=0; si < Ns; ++si){
+                for(int sj=0; sj < Ns; ++sj){
+                    for (int ci=0; ci < Nc; ++ci){
+                    for (int cj=0; cj < Nc; ++cj){
+                      // pick the single element (si,sj)(ci,cj) of a at this site ...
+                      left()()() = a_v[site]()(si,sj)(ci,cj);
+                      // ... and store it times the whole spin-colour matrix of b at this site
+                      lret_v[site]()(si,sj)(ci,cj)=left()*b_v[site]();
+                    }}
+                }}
+            }
+#endif      
+}
+
+
+
+
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+void TFourQuark<FImpl1, FImpl2>::setup(void)
+{
+    envCreateLat(LatticeSpinColourMatrix, getName()); // NOTE(review): presumably creates a lattice named after the module; execute() never reads it - confirm it is needed
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl1, typename FImpl2>
+void TFourQuark<FImpl1, FImpl2>::execute(void)
+{
+
+/*********************************************************************************
+
+TFourQuark : Creates the four quark vertex required for the NPR of four-quark ops
+
+V_{Gamma_1,Gamma_2} = sum_x [ ( g5 * adj(S'(x,p2)) * g5 * G1 * S'(x,p1) )_ci,cj;si,sj x ( g5 * adj(S'(x,p2)) * g5 * G2 * S'(x,p1) )_ck,cl;sk,sl ]
+
+Create a bilinear vertex for G1 and G2; the spin and colour indices are kept free. There are 16 potential Gs.
+We then form the outer product of V1 and V2, keeping the spin and colour indices uncontracted.
+This is then summed over the lattice coordinate.
+The result is a SpinColourSpinColourMatrix - with 4 colour and 4 spin indices. 
+We have up to 256 of these including the offdiag (G1 != G2).
+
+        \         /
+         \p1   p1/
+          \     /
+           \   /
+         G1 * * G2
+           /   \
+          /     \
+         /p2   p2\
+        /         \
+
+*********************************************************************************/
+
+    LOG(Message) << "Computing fourquark contractions '" << getName() << "' using"
+                 << " momentum '" << par().Sin << "' and '" << par().Sout << "'"
+                 << std::endl;
+
+    BinaryWriter             writer(par().output);
+
+    // Propagators as stored in the environment; they are only read here.
+    PropagatorField1                            &Sin = *env().template getObject<PropagatorField1>(par().Sin);
+    PropagatorField2                            &Sout = *env().template getObject<PropagatorField2>(par().Sout);
+    // Phased copies S'(x,p) = S(x)*exp(-i p.x): phasing Sin/Sout in place would leave
+    // the shared propagators modified for every later module that uses them.
+    PropagatorField1                            Sin_ph(env().getGrid());
+    PropagatorField2                            Sout_ph(env().getGrid());
+    std::vector<Real>                           pin  = strToVec<Real>(par().pin), pout = strToVec<Real>(par().pout);
+    bool                                        fullbasis = par().fullbasis;
+    Gamma                                       g5(Gamma::Algebra::Gamma5);
+    Result                                      result;
+    LatticeComplex                              pdotxin(env().getGrid()), pdotxout(env().getGrid()), coor(env().getGrid());
+    LatticeSpinColourMatrix                     bilinear_mu(env().getGrid()), bilinear_nu(env().getGrid());
+    LatticeSpinColourSpinColourMatrix           lret(env().getGrid()); 
+    Complex                         Ci(0.0,1.0);
+
+    // find p.x for in and out so the phase can be accounted for in the propagators:
+    // p.x = sum_mu (2*pi/L_mu)*p_mu*x_mu, with L_mu the lattice extent in direction mu
+    // (the extents used to be copied from pin, i.e. the momentum was divided by itself).
+    pdotxin=Zero();
+    pdotxout=Zero();
+    for (unsigned int mu = 0; mu < 4; ++mu)
+    {
+        Real TwoPiL =  M_PI * 2.0/ env().getGrid()->_fdimensions[mu];
+        LatticeCoordinate(coor,mu);
+        pdotxin = pdotxin +(TwoPiL * pin[mu]) * coor;
+        pdotxout= pdotxout +(TwoPiL * pout[mu]) * coor;
+    }
+    Sin_ph  = Sin*exp(-Ci*pdotxin); //phase corrections
+    Sout_ph = Sout*exp(-Ci*pdotxout);
+
+    //Set up Gammas: every second algebra element starting at index 1
+    std::vector<Gamma> gammavector;
+    for( int i=1; i<Gamma::nGamma; i+=2){
+        Gamma::Algebra gam = i;
+        gammavector.push_back(Gamma(gam));
+    }
+
+    lret = Zero();
+    if (fullbasis == true){ // all combinations of mu and nu
+        result.fourquark.resize(Gamma::nGamma/2*Gamma::nGamma/2);
+        for( int mu=0; mu<Gamma::nGamma/2; mu++){
+            bilinear_mu = g5*adj(Sout_ph)*g5*gammavector[mu]*Sin_ph;
+            // nu uses the same bound as mu: running it up to Gamma::nGamma indexed
+            // past the end of both gammavector and result.fourquark.
+            for ( int nu=0; nu<Gamma::nGamma/2; nu++){
+                bilinear_nu = g5*adj(Sout_ph)*g5*gammavector[nu]*Sin_ph;
+                LOG(Message) << "bilinear_nu for nu = " << nu << " is - " << bilinear_nu << std::endl;
+                result.fourquark[mu*Gamma::nGamma/2 + nu] = Zero();
+                tensorprod(lret,bilinear_mu,bilinear_nu);
+                result.fourquark[mu*Gamma::nGamma/2 + nu] = sum(lret);
+            }
+        }
+    } else {
+        // diagonal combinations only (G1 == G2); the loop used to stop after mu = 0,
+        // leaving every other entry of the resized result at its default value.
+        result.fourquark.resize(Gamma::nGamma/2);
+        for( int mu=0; mu<Gamma::nGamma/2; mu++ ){
+            bilinear_mu = g5*adj(Sout_ph)*g5*gammavector[mu]*Sin_ph;
+            //LOG(Message) << "bilinear_mu for mu = " << mu << " is - " << bilinear_mu << std::endl;
+            result.fourquark[mu] = Zero();
+            tensorprod(lret,bilinear_mu,bilinear_mu); //tensor outer product
+            result.fourquark[mu] = sum(lret);
+        }
+    }
+    write(writer, "fourquark", result.fourquark);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_FourQuark_hpp_
diff --git a/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc b/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
new file mode 100644
index 00000000..b909bf92
--- /dev/null
+++ b/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MNoise;
+
+template class Grid::Hadrons::MNoise::TFullVolumeSpinColorDiagonal<FIMPL>;  // explicit instantiation of the module template for FIMPL
+template class Grid::Hadrons::MNoise::TFullVolumeSpinColorDiagonal<ZFIMPL>; // explicit instantiation of the module template for ZFIMPL
diff --git a/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp b/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
new file mode 100644
index 00000000..93990882
--- /dev/null
+++ b/Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
@@ -0,0 +1,121 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
+#define Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/DilutedNoise.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *             Generate full volume spin-color diagonal noise                *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MNoise)
+
+class FullVolumeSpinColorDiagonalPar: Serializable // parameter set used by TFullVolumeSpinColorDiagonal (its Module<> template argument)
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FullVolumeSpinColorDiagonalPar,
+                                    unsigned int, nsrc); // read as par().nsrc in setup() and passed on when the noise object is created
+};
+
+template <typename FImpl>
+class TFullVolumeSpinColorDiagonal: public Module<FullVolumeSpinColorDiagonalPar> // module that creates and fills a full-volume spin-color diagonal noise object
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,); // macro defined outside this file; setup() relies on the FermionField name — presumably introduced here, confirm
+public:
+    // constructor (the name is forwarded to the Module<> base)
+    TFullVolumeSpinColorDiagonal(const std::string name);
+    // destructor (empty body)
+    virtual ~TFullVolumeSpinColorDiagonal(void) {};
+    // dependency relation: getInput() returns an empty list, getOutput() returns {getName()}
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the noise object under getName() through envCreateDerived
+    virtual void setup(void);
+    // execution: fetches that object and calls generateNoise(rng4d()) on it
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(FullVolumeSpinColorDiagonal, TFullVolumeSpinColorDiagonal<FIMPL>, MNoise);   // FIMPL variant; NOTE(review): macro defined outside this file, presumably registers the type with the module factory — confirm
+MODULE_REGISTER_TMP(ZFullVolumeSpinColorDiagonal, TFullVolumeSpinColorDiagonal<ZFIMPL>, MNoise); // ZFIMPL variant, registered under the Z-prefixed name
+
+/******************************************************************************
+ *              TFullVolumeSpinColorDiagonal implementation                  *
+ ******************************************************************************/
+// constructor: forwards the module name to the Module<> base /////////////////
+template <typename FImpl>
+TFullVolumeSpinColorDiagonal<FImpl>::TFullVolumeSpinColorDiagonal(const std::string name)
+    : Module<FullVolumeSpinColorDiagonalPar>(name)
+{ /* no further state to initialise */ }
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TFullVolumeSpinColorDiagonal<FImpl>::getInput(void)
+{
+    // no object produced by another module is read: the dependency list is empty
+
+    return std::vector<std::string>();
+}
+
+template <typename FImpl>
+std::vector<std::string> TFullVolumeSpinColorDiagonal<FImpl>::getOutput(void)
+{
+    // single product, listed under the module's own name
+    std::vector<std::string> products(1, getName());
+    return products;
+}
+
+// setup: create the object listed by getOutput() /////////////////////////////
+template <typename FImpl>
+void TFullVolumeSpinColorDiagonal<FImpl>::setup(void)
+{
+    envCreateDerived(DilutedNoise<FImpl>,                     // type under which execute() later fetches the object with envGet
+                     FullVolumeSpinColorDiagonalNoise<FImpl>, // second type argument; presumably the concrete class that gets constructed — confirm against the macro
+                     getName(), 1, envGetGrid(FermionField), par().nsrc); // NOTE(review): macro defined elsewhere; meaning of the literal 1 and of the trailing arguments not visible here
+}
+
+// execution: fetch the object made in setup(), log, then fill it /////////////
+template <typename FImpl>
+void TFullVolumeSpinColorDiagonal<FImpl>::execute(void)
+{
+    auto &dilutedNoise = envGet(DilutedNoise<FImpl>, getName());
+    LOG(Message) << "Generating full volume, spin-color diagonal noise" << std::endl;
+    dilutedNoise.generateNoise(rng4d());
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MNoise_FullVolumeSpinColorDiagonal_hpp_
diff --git a/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.cc b/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.cc
new file mode 100644
index 00000000..52d2b62d
--- /dev/null
+++ b/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MNoise;
+
+template class Grid::Hadrons::MNoise::TTimeDilutedSpinColorDiagonal<FIMPL>;  // explicit instantiation of the module template for FIMPL
+template class Grid::Hadrons::MNoise::TTimeDilutedSpinColorDiagonal<ZFIMPL>; // explicit instantiation of the module template for ZFIMPL
diff --git a/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp b/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp
new file mode 100644
index 00000000..da634d6c
--- /dev/null
+++ b/Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp
@@ -0,0 +1,114 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MNoise_TimeDilutedSpinColorDiagonal_hpp_
+#define Hadrons_MNoise_TimeDilutedSpinColorDiagonal_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/DilutedNoise.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *             Generate time diluted spin-color diagonal noise                *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MNoise)
+
+template <typename FImpl>
+class TTimeDilutedSpinColorDiagonal: public Module<NoPar> // module that creates and fills a time-diluted spin-color diagonal noise object; uses NoPar as its parameter type
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,); // macro defined outside this file; setup() relies on the FermionField name — presumably introduced here, confirm
+public:
+    // constructor (the name is forwarded to the Module<> base)
+    TTimeDilutedSpinColorDiagonal(const std::string name);
+    // destructor (empty body)
+    virtual ~TTimeDilutedSpinColorDiagonal(void) {};
+    // dependency relation: getInput() returns an empty list, getOutput() returns {getName()}
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the noise object under getName() through envCreateDerived
+    virtual void setup(void);
+    // execution: fetches that object and calls generateNoise(rng4d()) on it
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(TimeDilutedSpinColorDiagonal, TTimeDilutedSpinColorDiagonal<FIMPL>, MNoise);   // FIMPL variant; NOTE(review): macro defined outside this file, presumably registers the type with the module factory — confirm
+MODULE_REGISTER_TMP(ZTimeDilutedSpinColorDiagonal, TTimeDilutedSpinColorDiagonal<ZFIMPL>, MNoise); // ZFIMPL variant, registered under the Z-prefixed name
+
+/******************************************************************************
+ *              TTimeDilutedSpinColorDiagonal implementation                  *
+ ******************************************************************************/
+// constructor: forwards the module name to the Module<> base /////////////////
+template <typename FImpl>
+TTimeDilutedSpinColorDiagonal<FImpl>::TTimeDilutedSpinColorDiagonal(const std::string name)
+    : Module<NoPar>(name)
+{ /* no further state to initialise */ }
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TTimeDilutedSpinColorDiagonal<FImpl>::getInput(void)
+{
+    // no object produced by another module is read: the dependency list is empty
+
+    return std::vector<std::string>();
+}
+
+template <typename FImpl>
+std::vector<std::string> TTimeDilutedSpinColorDiagonal<FImpl>::getOutput(void)
+{
+    // single product, listed under the module's own name
+    std::vector<std::string> products(1, getName());
+    return products;
+}
+
+// setup: create the object listed by getOutput() /////////////////////////////
+template <typename FImpl>
+void TTimeDilutedSpinColorDiagonal<FImpl>::setup(void)
+{
+    envCreateDerived(DilutedNoise<FImpl>,                      // type under which execute() later fetches the object with envGet
+                     TimeDilutedSpinColorDiagonalNoise<FImpl>, // second type argument; presumably the concrete class that gets constructed — confirm against the macro
+                     getName(), 1, envGetGrid(FermionField)); // NOTE(review): macro defined elsewhere; meaning of the literal 1 and of the trailing argument not visible here
+}
+
+// execution: fill the noise object made in setup() ///////////////////////////
+template <typename FImpl>
+void TTimeDilutedSpinColorDiagonal<FImpl>::execute(void)
+{
+    auto &dilutedNoise = envGet(DilutedNoise<FImpl>, getName());
+    // announce the step before drawing the noise from the 4d generator
+    LOG(Message) << "Generating time-diluted, spin-color diagonal noise" << std::endl;
+    dilutedNoise.generateNoise(rng4d());
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MNoise_TimeDilutedSpinColorDiagonal_hpp_
diff --git a/Hadrons/Modules/MScalar/ChargedProp.cc b/Hadrons/Modules/MScalar/ChargedProp.cc
new file mode 100644
index 00000000..f56dfeda
--- /dev/null
+++ b/Hadrons/Modules/MScalar/ChargedProp.cc
@@ -0,0 +1,312 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalar/ChargedProp.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalar/ChargedProp.hpp>
+#include <Hadrons/Modules/MScalar/Scalar.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalar;
+
+/******************************************************************************
+*                     TChargedProp implementation                             *
+******************************************************************************/
+// constructor: forwards the module name to the Module<> base /////////////////
+TChargedProp::TChargedProp(const std::string name)
+    : Module<ChargedPropPar>(name)
+{ /* name and flag members are assigned later, in setup() */ }
+
+// dependencies/products ///////////////////////////////////////////////////////
+std::vector<std::string> TChargedProp::getInput(void)
+{
+    // dependencies, in this order: the scalar source, then the EM field
+
+    return {par().source, par().emField};
+}
+
+std::vector<std::string> TChargedProp::getOutput(void)
+{
+    // the full propagator comes first, followed by one suffixed name per component
+    std::vector<std::string> products(1, getName());
+    for (const char *suffix : {"_0", "_Q", "_Sun", "_Tad"}) products.push_back(getName() + suffix);
+    return products;
+}
+
+// setup: build object names, note which cached objects already exist, then declare all objects
+void TChargedProp::setup(void)
+{
+    freeMomPropName_ = FREEMOMPROP(par().mass); // name built from the mass only (FREEMOMPROP is defined outside this file)
+    phaseName_.clear();
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        phaseName_.push_back("_shiftphase_" + std::to_string(mu)); // one name per dimension; does not contain getName()
+    }
+    GFSrcName_ = getName() + "_DinvSrc"; // the remaining names are all prefixed with this module's name
+	prop0Name_ = getName() + "_0";
+    propQName_ = getName() + "_Q";
+    propSunName_ = getName() + "_Sun";
+    propTadName_ = getName() + "_Tad";
+    fftName_   = getName() + "_fft";
+
+    freeMomPropDone_ = env().hasCreatedObject(freeMomPropName_); // existence is sampled BEFORE the envCache* calls below; makeCaches() skips the objects flagged here
+    GFSrcDone_       = env().hasCreatedObject(GFSrcName_);
+    phasesDone_      = env().hasCreatedObject(phaseName_[0]); // only the first phase is tested for all dimensions
+	prop0Done_		 = env().hasCreatedObject(prop0Name_);
+    envCacheLat(ScalarField, freeMomPropName_); // NOTE(review): envCache*/envCreate*/envTmp* are macros defined elsewhere; presumably cache = reuse if present, create = new output, tmp = scratch — confirm
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        envCacheLat(ScalarField, phaseName_[mu]);
+    }
+    envCacheLat(ScalarField, GFSrcName_);
+	envCacheLat(ScalarField, prop0Name_);
+    envCreateLat(ScalarField, getName());   // the four fields written by execute()
+    envCreateLat(ScalarField, propQName_);
+    envCreateLat(ScalarField, propSunName_);
+    envCreateLat(ScalarField, propTadName_);
+    envTmpLat(ScalarField, "buf");    // temporaries later obtained with envGetTmp in execute(), momD1() and momD2()
+    envTmpLat(ScalarField, "result");
+    envTmpLat(ScalarField, "Amu");
+    envCache(FFT, fftName_, 1, env().getGrid()); // FFT object fetched as envGet(FFT, fftName_) in execute() and makeCaches()
+}
+
+// execution: prop = buf + q*propQ + q^2*(propSun + propTad), optional momentum-projected output
+void TChargedProp::execute(void)
+{
+    // CACHING ANALYTIC EXPRESSIONS
+    makeCaches();
+
+    // PROPAGATOR CALCULATION
+    LOG(Message) << "Computing charged scalar propagator"
+                 << " (mass= " << par().mass
+                 << ", charge= " << par().charge << ")..." << std::endl;
+    
+    auto   &prop    = envGet(ScalarField, getName());
+	auto   &prop0   = envGet(ScalarField, prop0Name_); // NOTE(review): prop0 is not used again in this function
+	auto   &propQ   = envGet(ScalarField, propQName_);
+	auto   &propSun = envGet(ScalarField, propSunName_);
+	auto   &propTad = envGet(ScalarField, propTadName_);
+    auto   &GFSrc   = envGet(ScalarField, GFSrcName_);       // G*F*src, filled by makeCaches()
+    auto   &G       = envGet(ScalarField, freeMomPropName_); // momentum-space free propagator, filled by makeCaches()
+    auto   &fft     = envGet(FFT, fftName_);
+    double q        = par().charge;
+    envGetTmp(ScalarField, buf);
+
+    // -G*momD1*G*F*Src (momD1 = F*D1*Finv)
+    propQ = GFSrc;
+    momD1(propQ, fft); // overwrites propQ in place
+    propQ = -G*propQ;
+    propSun = -propQ;  // keep G*momD1*G*F*Src for the next term before propQ is transformed
+    fft.FFT_dim(propQ, propQ, env().getNd()-1, FFT::backward); // backward FFT along the last dimension only
+
+    // G*momD1*G*momD1*G*F*Src (on entry propSun = G*momD1*G*F*Src, copied from -propQ above)
+    momD1(propSun, fft);
+    propSun = G*propSun;
+    fft.FFT_dim(propSun, propSun, env().getNd()-1, FFT::backward);
+
+    // -G*momD2*G*F*Src (momD2 = F*D2*Finv)
+    propTad = GFSrc;
+    momD2(propTad, fft);
+    propTad = -G*propTad;
+    fft.FFT_dim(propTad, propTad, env().getNd()-1, FFT::backward);
+    
+    // full charged scalar propagator
+    fft.FFT_dim(buf, GFSrc, env().getNd()-1, FFT::backward); // buf = free term, transformed like the three corrections
+    prop = buf + q*propQ + q*q*propSun + q*q*propTad;
+
+    // OUTPUT IF NECESSARY
+    if (!par().output.empty())
+    {
+        Result result;
+        TComplex            site;
+        std::vector<int>    siteCoor;
+
+        LOG(Message) << "Saving momentum-projected propagator to '"
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
+                     << std::endl;
+        result.projection.resize(par().outputMom.size()); // one Projection per requested momentum string
+        result.lattice_size = env().getGrid()->FullDimensions().toVector();
+        result.mass = par().mass;
+        result.charge = q;
+        siteCoor.resize(env().getNd());
+        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+        {
+            result.projection[i_p].momentum = strToVec<int>(par().outputMom[i_p]); // parse the momentum string into integers
+
+            LOG(Message) << "Calculating (" << par().outputMom[i_p]
+                         << ") momentum projection" << std::endl;
+
+            result.projection[i_p].corr_0.resize(env().getGrid()->FullDimensions()[env().getNd()-1]); // every correlator gets one entry per site of the last dimension
+            result.projection[i_p].corr.resize(env().getGrid()->FullDimensions()[env().getNd()-1]);
+            result.projection[i_p].corr_Q.resize(env().getGrid()->FullDimensions()[env().getNd()-1]);
+            result.projection[i_p].corr_Sun.resize(env().getGrid()->FullDimensions()[env().getNd()-1]);
+            result.projection[i_p].corr_Tad.resize(env().getGrid()->FullDimensions()[env().getNd()-1]);
+
+            for (unsigned int j = 0; j < env().getNd()-1; ++j)
+            {
+                siteCoor[j] = result.projection[i_p].momentum[j]; // NOTE(review): reads Nd-1 components without checking the parsed vector's size
+            }
+
+            for (unsigned int t = 0; t < result.projection[i_p].corr.size(); ++t)
+            {
+                siteCoor[env().getNd()-1] = t; // first Nd-1 coordinates hold the momentum components, the last one runs over t
+                peekSite(site, prop, siteCoor);
+                result.projection[i_p].corr[t]=TensorRemove(site);
+                peekSite(site, buf, siteCoor);
+                result.projection[i_p].corr_0[t]=TensorRemove(site);
+                peekSite(site, propQ, siteCoor);
+                result.projection[i_p].corr_Q[t]=TensorRemove(site);
+                peekSite(site, propSun, siteCoor);
+                result.projection[i_p].corr_Sun[t]=TensorRemove(site);
+                peekSite(site, propTad, siteCoor);
+                result.projection[i_p].corr_Tad[t]=TensorRemove(site);
+            }
+        }
+        saveResult(par().output, "prop", result);
+    }
+
+    std::vector<int> mask(env().getNd(),1);
+    mask[env().getNd()-1] = 0; // mask is 1 for every dimension except the last, which was already transformed above
+    fft.FFT_dim_mask(prop, prop, mask, FFT::backward);
+    fft.FFT_dim_mask(propQ, propQ, mask, FFT::backward);
+    fft.FFT_dim_mask(propSun, propSun, mask, FFT::backward);
+    fft.FFT_dim_mask(propTad, propTad, mask, FFT::backward);
+}
+
+void TChargedProp::makeCaches(void) // fills the cached fields setup() found missing and rebuilds the phase_ pointer list
+{
+    auto &freeMomProp = envGet(ScalarField, freeMomPropName_);
+    auto &GFSrc       = envGet(ScalarField, GFSrcName_);
+    auto &prop0       = envGet(ScalarField, prop0Name_);
+    auto &fft         = envGet(FFT, fftName_);
+    // each *Done_ flag was sampled in setup(); a false flag means the field must be computed here
+    if (!freeMomPropDone_)
+    {
+        LOG(Message) << "Caching momentum-space free scalar propagator"
+                     << " (mass= " << par().mass << ")..." << std::endl;
+        SIMPL::MomentumSpacePropagator(freeMomProp, par().mass);
+    }
+    if (!GFSrcDone_)
+    {
+        auto &source = envGet(ScalarField, par().source);
+
+        LOG(Message) << "Caching G*F*src..." << std::endl;
+        fft.FFT_all_dim(GFSrc, source, FFT::forward); // F*src
+        GFSrc = freeMomProp*GFSrc;                    // G*F*src
+    }
+    if (!prop0Done_)
+    {
+        LOG(Message) << "Caching position-space free scalar propagator..."
+                     << std::endl;
+        fft.FFT_all_dim(prop0, GFSrc, FFT::backward);
+    }
+    phase_.clear(); // reset in both branches: otherwise every call through the first branch appends Nd more pointers
+    if (!phasesDone_)
+    {
+        auto l = env().getGrid()->FullDimensions();
+        Complex          ci(0.0,1.0);
+
+        LOG(Message) << "Caching shift phases..." << std::endl;
+        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+        {
+            Real twoPiL = M_PI*2./l[mu];
+            auto &phmu  = envGet(ScalarField, phaseName_[mu]);
+
+            LatticeCoordinate(phmu, mu);  // phmu = coordinate along mu
+            phmu = exp(ci*twoPiL*phmu);   // phmu = exp(2*pi*i*x_mu/L_mu)
+            phase_.push_back(&phmu);
+        }
+    }
+    else
+    {
+        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+        {
+            phase_.push_back(env().getObject<ScalarField>(phaseName_[mu])); // phases already existed at setup(): only collect the pointers
+        }
+    }
+}
+
+void TChargedProp::momD1(ScalarField &s, FFT &fft) // in place: s <- sum_mu [ -i*F(A_mu*Finv(phase_mu*s)) + i*adj(phase_mu)*F(A_mu*Finv(s)) ], F = forward FFT, Finv = backward FFT
+{
+    auto        &A = envGet(EmField, par().emField);
+    Complex     ci(0.0,1.0);
+
+    envGetTmp(ScalarField, buf);    // temporaries declared with envTmpLat in setup()
+    envGetTmp(ScalarField, result);
+    envGetTmp(ScalarField, Amu);
+
+    result = Zero();
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu) // first sum: uses s as passed in (phase_ is filled by makeCaches())
+    {
+        Amu = peekLorentz(A, mu);
+        buf = (*phase_[mu])*s;
+        fft.FFT_all_dim(buf, buf, FFT::backward);
+        buf = Amu*buf;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        result = result - ci*buf;
+    }
+    fft.FFT_all_dim(s, s, FFT::backward); // s is transformed in place once; the second loop reads the transformed s
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);
+        buf = Amu*s;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        result = result + ci*adj(*phase_[mu])*buf;
+    }
+
+    s = result; // the argument is overwritten with the accumulated sum
+}
+
+void TChargedProp::momD2(ScalarField &s, FFT &fft) // in place: s <- sum_mu 0.5*[ F(A_mu^2*Finv(phase_mu*s)) + adj(phase_mu)*F(A_mu^2*Finv(s)) ], F = forward FFT, Finv = backward FFT
+{
+    auto &A = envGet(EmField, par().emField);
+
+    envGetTmp(ScalarField, buf);    // temporaries declared with envTmpLat in setup()
+    envGetTmp(ScalarField, result);
+    envGetTmp(ScalarField, Amu);
+
+    result = Zero();
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu) // first sum: uses s as passed in (phase_ is filled by makeCaches())
+    {
+        Amu = peekLorentz(A, mu);
+        buf = (*phase_[mu])*s;
+        fft.FFT_all_dim(buf, buf, FFT::backward);
+        buf = Amu*Amu*buf;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        result = result + .5*buf;
+    }
+    fft.FFT_all_dim(s, s, FFT::backward); // s is transformed in place once; the second loop reads the transformed s
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);        
+        buf = Amu*Amu*s;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        result = result + .5*adj(*phase_[mu])*buf;
+    }
+
+    s = result; // the argument is overwritten with the accumulated sum
+}
diff --git a/Hadrons/Modules/MScalar/ChargedProp.hpp b/Hadrons/Modules/MScalar/ChargedProp.hpp
new file mode 100644
index 00000000..443c38a6
--- /dev/null
+++ b/Hadrons/Modules/MScalar/ChargedProp.hpp
@@ -0,0 +1,113 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalar/ChargedProp.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalar_ChargedProp_hpp_
+#define Hadrons_MScalar_ChargedProp_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                       Charged scalar propagator                            *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalar)
+
+class ChargedPropPar: Serializable // parameter set used by TChargedProp (its Module<> template argument)
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar,
+                                    std::string, emField,  // name of the EmField object read in momD1()/momD2()
+                                    std::string, source,   // name of the ScalarField source read in makeCaches()
+                                    double,      mass,     // passed to FREEMOMPROP and SIMPL::MomentumSpacePropagator
+                                    double,      charge,   // factor q in prop = buf + q*propQ + q*q*(propSun + propTad)
+                                    std::string, output,   // result file stem; an empty string skips the output section of execute()
+                                    std::vector<std::string>, outputMom); // momenta to project on, each parsed with strToVec<int>
+};
+
+class TChargedProp: public Module<ChargedPropPar> // module computing the charged scalar propagator and its components
+{
+public:
+    BASIC_TYPE_ALIASES(SIMPL,); // macro defined outside this file; presumably introduces ScalarField used below — confirm
+    typedef PhotonR::GaugeField     EmField; // type under which the EM field is fetched in momD1()/momD2()
+    typedef PhotonR::GaugeLinkField EmComp;  // NOTE(review): not referenced in the ChargedProp.cc member functions
+    class Result: Serializable // payload passed to saveResult() by execute()
+    {
+    public:
+        class Projection: Serializable // correlators for one requested momentum, indexed by the last lattice coordinate
+        {
+        public:
+            GRID_SERIALIZABLE_CLASS_MEMBERS(Projection,
+                                            std::vector<int>,     momentum,  // parsed from one outputMom string
+                                            std::vector<Complex>, corr,      // read from the full propagator (prop)
+                                            std::vector<Complex>, corr_0,    // read from buf, the free term
+                                            std::vector<Complex>, corr_Q,    // read from propQ
+                                            std::vector<Complex>, corr_Sun,  // read from propSun
+                                            std::vector<Complex>, corr_Tad); // read from propTad
+        };
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
+                                        std::vector<int>,        lattice_size, // FullDimensions() of the grid
+                                        double,                  mass,         // copy of par().mass
+                                        double,                  charge,       // copy of par().charge
+                                        std::vector<Projection>, projection);  // one entry per outputMom element
+    };
+public:
+    // constructor (the name is forwarded to the Module<> base)
+    TChargedProp(const std::string name);
+    // destructor (empty body)
+    virtual ~TChargedProp(void) {};
+    // dependency relation: inputs are par().source and par().emField; outputs are getName() plus the _0, _Q, _Sun, _Tad names
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup: builds object names, records which cached objects exist, declares all objects
+    virtual void setup(void);
+    // execution: fills the caches, computes the propagator terms, optionally saves projections
+    virtual void execute(void);
+private:
+    void makeCaches(void);                // fills missing cached fields and rebuilds phase_
+    void momD1(ScalarField &s, FFT &fft); // overwrites s in place (terms linear in the EM field)
+    void momD2(ScalarField &s, FFT &fft); // overwrites s in place (terms quadratic in the EM field)
+private:
+    bool                       freeMomPropDone_, GFSrcDone_, prop0Done_, // set in setup(): true when the object already existed
+                               phasesDone_;
+    std::string                freeMomPropName_, GFSrcName_, prop0Name_, // object names assigned in setup()
+                               propQName_, propSunName_, propTadName_, fftName_;
+    std::vector<std::string>   phaseName_; // one "_shiftphase_<mu>" name per dimension
+    std::vector<ScalarField *> phase_;     // pointers to the phase fields, filled by makeCaches(); not owned by this class as far as this file shows
+};
+
+MODULE_REGISTER(ChargedProp, TChargedProp, MScalar); // NOTE(review): macro defined outside this file, presumably registers TChargedProp with the module factory as "ChargedProp" — confirm
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalar_ChargedProp_hpp_
diff --git a/extras/Hadrons/Modules/MScalar/FreeProp.cc b/Hadrons/Modules/MScalar/FreeProp.cc
similarity index 62%
rename from extras/Hadrons/Modules/MScalar/FreeProp.cc
rename to Hadrons/Modules/MScalar/FreeProp.cc
index 924db288..4b25fbd0 100644
--- a/extras/Hadrons/Modules/MScalar/FreeProp.cc
+++ b/Hadrons/Modules/MScalar/FreeProp.cc
@@ -1,5 +1,32 @@
-#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
-#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalar/FreeProp.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalar/FreeProp.hpp>
+#include <Hadrons/Modules/MScalar/Scalar.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -56,8 +83,6 @@ void TFreeProp::execute(void)
     
     if (!par().output.empty())
     {
-        TextWriter            writer(par().output + "." +
-                                     std::to_string(vm().getTrajectory()));
         std::vector<TComplex> buf;
         std::vector<Complex>  result;
         
@@ -67,6 +92,6 @@ void TFreeProp::execute(void)
         {
             result[t] = TensorRemove(buf[t]);
         }
-        write(writer, "prop", result);
+        saveResult(par().output, "freeprop", result);
     }
 }
diff --git a/extras/Hadrons/Modules/MGauge/Load.hpp b/Hadrons/Modules/MScalar/FreeProp.hpp
similarity index 63%
rename from extras/Hadrons/Modules/MGauge/Load.hpp
rename to Hadrons/Modules/MScalar/FreeProp.hpp
index a338af79..5ed6756d 100644
--- a/extras/Hadrons/Modules/MGauge/Load.hpp
+++ b/Hadrons/Modules/MScalar/FreeProp.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MGauge/Load.hpp
+Source file: Hadrons/Modules/MScalar/FreeProp.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -26,35 +25,38 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
+#ifndef Hadrons_MScalar_FreeProp_hpp_
+#define Hadrons_MScalar_FreeProp_hpp_
 
-#ifndef Hadrons_MGauge_Load_hpp_
-#define Hadrons_MGauge_Load_hpp_
-
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
 /******************************************************************************
- *                         Load a NERSC configuration                         *
+ *                               FreeProp                                     *
  ******************************************************************************/
-BEGIN_MODULE_NAMESPACE(MGauge)
+BEGIN_MODULE_NAMESPACE(MScalar)
 
-class LoadPar: Serializable
+class FreePropPar: Serializable
 {
 public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadPar,
-                                    std::string, file);
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
+                                    std::string, source,
+                                    double,      mass,
+                                    std::string, output);
 };
 
-class TLoad: public Module<LoadPar>
+class TFreeProp: public Module<FreePropPar>
 {
+public:
+    BASIC_TYPE_ALIASES(SIMPL,);
 public:
     // constructor
-    TLoad(const std::string name);
+    TFreeProp(const std::string name);
     // destructor
-    virtual ~TLoad(void) = default;
+    virtual ~TFreeProp(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -63,12 +65,15 @@ protected:
     virtual void setup(void);
     // execution
     virtual void execute(void);
+private:
+    std::string freeMomPropName_;
+    bool        freePropDone_;
 };
 
-MODULE_REGISTER_NS(Load, TLoad, MGauge);
+MODULE_REGISTER(FreeProp, TFreeProp, MScalar);
 
 END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_MGauge_Load_hpp_
+#endif // Hadrons_MScalar_FreeProp_hpp_
diff --git a/Hadrons/Modules/MScalar/Scalar.hpp b/Hadrons/Modules/MScalar/Scalar.hpp
new file mode 100644
index 00000000..afdb1713
--- /dev/null
+++ b/Hadrons/Modules/MScalar/Scalar.hpp
@@ -0,0 +1,33 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalar/Scalar.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_Scalar_hpp_
+#define Hadrons_Scalar_hpp_
+#include <string>
+// Object name built from a mass value m; parenthesised so it composes safely.
+#define FREEMOMPROP(m) (std::string("_scalar_mom_prop_") + std::to_string(m))
+#endif // Hadrons_Scalar_hpp_
diff --git a/Hadrons/Modules/MScalar/ScalarVP.cc b/Hadrons/Modules/MScalar/ScalarVP.cc
new file mode 100644
index 00000000..9b56d22b
--- /dev/null
+++ b/Hadrons/Modules/MScalar/ScalarVP.cc
@@ -0,0 +1,564 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalar/ScalarVP.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalar/ChargedProp.hpp>
+#include <Hadrons/Modules/MScalar/ScalarVP.hpp>
+#include <Hadrons/Modules/MScalar/Scalar.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalar;
+
+/*
+ * Scalar QED vacuum polarisation up to O(alpha)
+ *
+ * Conserved vector 2-point function diagram notation:
+ *        _______
+ *       /       \
+ * U_nu *         * U_mu
+ *       \_______/
+ *
+ *                (   adj(S(a\hat{nu}|x)) U_mu(x) S(0|x+a\hat{mu}) U_nu(0)    )
+ *          = 2 Re(                             -                             )
+ *                ( adj(S(a\hat{nu}|x+a\hat{mu})) adj(U_mu(x)) S(0|x) U_nu(0) )
+ *  
+ *
+ *            _______
+ *           /       \
+ * free = 1 *         * 1
+ *           \_______/
+ *
+ *
+ *
+ *             _______
+ *            /       \
+ * S = iA_nu *         * iA_mu
+ *            \_______/
+ *
+ *
+ *         Delta_1
+ *         ___*___
+ *        /       \
+ * X = 1 *         * 1
+ *        \___*___/
+ *         Delta_1
+ *
+ *          Delta_1                     Delta_1
+ *          ___*___                     ___*___
+ *         /       \                   /       \
+ *      1 *         * iA_mu  +  iA_nu *         * 1
+ *         \_______/                   \_______/
+ * 4C =        _______                     _______
+ *            /       \                   /       \
+ *      +  1 *         * iA_mu  +  iA_nu *         * 1
+ *            \___*___/                   \___*___/
+ *             Delta_1                     Delta_1
+ *
+ *     Delta_1   Delta_1
+ *          _*___*_             _______
+ *         /       \           /       \
+ * 2E = 1 *         * 1  +  1 *         * 1
+ *         \_______/           \_*___*_/
+ *                         Delta_1   Delta_1
+ *
+ *          Delta_2
+ *          ___*___             _______
+ *         /       \           /       \
+ * 2T = 1 *         * 1  +  1 *         * 1
+ *         \_______/           \___*___/
+ *                              Delta_2
+ *
+ *
+ *                    _______
+ *                   /       \
+ * srcT = -A_nu^2/2 *         * 1
+ *                   \_______/
+ *
+ *
+ *
+ *            _______
+ *           /       \
+ * snkT = 1 *         * -A_mu^2/2
+ *           \_______/
+ *
+ * Full VP to O(alpha) = free + q^2*(S+X+4C+2E+2T+srcT+snkT)
+ */
+
+/******************************************************************************
+*                  TScalarVP implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+TScalarVP::TScalarVP(const std::string name)
+: Module<ScalarVPPar>(name) // forwards the module name to the Module base
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+std::vector<std::string> TScalarVP::getInput(void)
+{
+    // the four propagator names all derive from the scalarProp parameter
+    const std::string stem = par().scalarProp;
+
+    prop0Name_   = stem + "_0";
+    propQName_   = stem + "_Q";
+    propSunName_ = stem + "_Sun";
+    propTadName_ = stem + "_Tad";
+
+    return {par().emField, prop0Name_, propQName_, propSunName_, propTadName_};
+}
+
+std::vector<std::string> TScalarVP::getOutput(void)
+{
+    // One output per tensor component, named "<module>_<mu>_<nu>". The
+    // "_propQ_<mu>" fields created in setup() are not declared as outputs.
+    const auto               nd = env().getNd();
+    std::vector<std::string> names;
+
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    {
+        const std::string prefix = getName() + "_" + std::to_string(mu) + "_";
+        for (unsigned int nu = 0; nu < nd; ++nu)
+        {
+            names.push_back(prefix + std::to_string(nu));
+        }
+    }
+    return names;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+void TScalarVP::setup(void)
+{
+    // names of the objects fetched in execute(), then creation of own fields
+    freeMomPropName_ = FREEMOMPROP(static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().mass);
+    GFSrcName_ = par().scalarProp + "_DinvSrc";
+    fftName_   = par().scalarProp + "_fft";
+    phaseName_.clear();
+    muPropQName_.clear();
+    vpTensorName_.clear();
+    momPhaseName_.clear();
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
+        muPropQName_.push_back(getName() + "_propQ_" + std::to_string(mu));
+        std::vector<std::string> vpTensorName_mu;
+        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
+        {
+            vpTensorName_mu.push_back(getName() + "_" + std::to_string(mu)
+                                      + "_" + std::to_string(nu));
+        }
+        vpTensorName_.push_back(vpTensorName_mu);
+    }
+    if (!par().output.empty())
+    {
+        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+        {
+            momPhaseName_.push_back("_momentumphase_" + std::to_string(i_p));
+        }
+    }
+    // lattice fields created by this module: propQ_mu and the VP tensor
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        envCreateLat(ScalarField, muPropQName_[mu]);
+        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
+        {
+            envCreateLat(ScalarField, vpTensorName_[mu][nu]);
+        }
+    }
+    if (!par().output.empty())
+    {
+        momPhasesDone_ = !momPhaseName_.empty() // outputMom may be empty
+                         && env().hasCreatedObject(momPhaseName_[0]);
+        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+        {
+            envCacheLat(ScalarField, momPhaseName_[i_p]);
+        }
+    }
+    envTmpLat(ScalarField, "buf");
+    envTmpLat(ScalarField, "result");
+    envTmpLat(ScalarField, "Amu");
+    envTmpLat(ScalarField, "Usnk");
+    envTmpLat(ScalarField, "tmpProp");
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+void TScalarVP::execute(void)
+{
+    // CACHING ANALYTIC EXPRESSIONS
+    makeCaches();
+    // Below: fetch inputs, build shifted-source O(q) propagators, contract.
+    Complex ci(0.0,1.0);
+    Real    q        = static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().charge;
+    auto    &prop0   = envGet(ScalarField, prop0Name_);
+    auto    &propQ   = envGet(ScalarField, propQName_);
+    auto    &propSun = envGet(ScalarField, propSunName_);
+    auto    &propTad = envGet(ScalarField, propTadName_);
+    auto    &GFSrc   = envGet(ScalarField, GFSrcName_);
+    auto    &G       = envGet(ScalarField, freeMomPropName_);
+    auto    &fft     = envGet(FFT, fftName_);
+    phase_.clear();
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        auto &phmu = envGet(ScalarField, phaseName_[mu]);
+        phase_.push_back(&phmu);
+    }
+    
+    // PROPAGATORS FROM SHIFTED SOURCES
+    LOG(Message) << "Computing O(q) charged scalar propagators..."
+                 << std::endl;
+    std::vector<ScalarField *> muPropQ;
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        auto &propmu = envGet(ScalarField, muPropQName_[mu]);
+
+        // -G*momD1*G*F*tau_mu*Src (momD1 = F*D1*Finv)
+        propmu = adj(*phase_[mu])*GFSrc;
+        momD1(propmu, fft);
+        propmu = -G*propmu;
+        fft.FFT_all_dim(propmu, propmu, FFT::backward);
+
+        muPropQ.push_back(&propmu);
+    }
+
+    // CONTRACTIONS
+    auto        &A = envGet(EmField, par().emField);
+    envGetTmp(ScalarField, buf);
+    envGetTmp(ScalarField, result);
+    envGetTmp(ScalarField, Amu);
+    envGetTmp(ScalarField, Usnk);
+    envGetTmp(ScalarField, tmpProp);
+    TComplex    Anu0, Usrc;
+    std::vector<int> coor0 = {0, 0, 0, 0}; // origin; NOTE(review): hard-codes 4 dimensions
+    std::vector<std::vector<ScalarField *> > vpTensor;
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        std::vector<ScalarField *> vpTensor_mu;
+        for (unsigned int nu = 0; nu < env().getNd(); ++nu)
+        {
+            auto &vpmunu = envGet(ScalarField, vpTensorName_[mu][nu]);
+            vpTensor_mu.push_back(&vpmunu);
+        }
+        vpTensor.push_back(vpTensor_mu);
+    }
+
+    // Prepare output data structure if necessary
+    Result outputData;
+    if (!par().output.empty())
+    {
+        outputData.projection.resize(par().outputMom.size());
+        outputData.lattice_size = env().getGrid()->FullDimensions().toVector();
+        outputData.mass = static_cast<TChargedProp *>(vm().getModule(par().scalarProp))->par().mass;
+        outputData.charge = q;
+        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+        {
+            outputData.projection[i_p].momentum = strToVec<int>(par().outputMom[i_p]);
+            outputData.projection[i_p].pi.resize(env().getNd());
+            outputData.projection[i_p].pi_free.resize(env().getNd());
+            outputData.projection[i_p].pi_2E.resize(env().getNd());
+            outputData.projection[i_p].pi_2T.resize(env().getNd());
+            outputData.projection[i_p].pi_S.resize(env().getNd());
+            outputData.projection[i_p].pi_4C.resize(env().getNd());
+            outputData.projection[i_p].pi_X.resize(env().getNd());
+            outputData.projection[i_p].pi_srcT.resize(env().getNd());
+            outputData.projection[i_p].pi_snkT.resize(env().getNd());
+            for (unsigned int nu = 0; nu < env().getNd(); ++nu)
+            {
+                outputData.projection[i_p].pi[nu].resize(env().getNd());
+                outputData.projection[i_p].pi_free[nu].resize(env().getNd());
+                outputData.projection[i_p].pi_2E[nu].resize(env().getNd());
+                outputData.projection[i_p].pi_2T[nu].resize(env().getNd());
+                outputData.projection[i_p].pi_S[nu].resize(env().getNd());
+                outputData.projection[i_p].pi_4C[nu].resize(env().getNd());
+                outputData.projection[i_p].pi_X[nu].resize(env().getNd());
+                outputData.projection[i_p].pi_srcT[nu].resize(env().getNd());
+                outputData.projection[i_p].pi_snkT[nu].resize(env().getNd());
+            }
+        }
+    }
+
+    // Do contractions
+    for (unsigned int nu = 0; nu < env().getNd(); ++nu)
+    {
+        peekSite(Anu0, peekLorentz(A, nu), coor0); // A_nu at site coor0
+
+        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+        {
+            LOG(Message) << "Computing Pi[" << mu << "][" << nu << "]..."
+                         << std::endl;
+            Amu = peekLorentz(A, mu);
+
+            // free
+            tmpProp = Cshift(prop0, nu, -1);     // S_0(0|x-a\hat{\nu})
+                                                 // = S_0(a\hat{\nu}|x)
+            Usrc    = Complex(1.0,0.0);
+            vpContraction(result, prop0, tmpProp, Usrc, mu);
+            *vpTensor[mu][nu] = result;
+            // Do momentum projections if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pi_free[mu][nu], result,
+                            i_p);
+                }
+            }
+            tmpProp = result; // Just using tmpProp as a temporary ScalarField
+                              // here (buf is overwritten by calls to project())
+
+            // srcT
+            result = tmpProp * (-0.5)*Anu0*Anu0;
+            *vpTensor[mu][nu] += q*q*result;
+            // Do momentum projections if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pi_srcT[mu][nu], result,
+                            i_p);
+                }
+            }
+
+            // snkT
+            result = tmpProp * (-0.5)*Amu*Amu;
+            *vpTensor[mu][nu] += q*q*result;
+            // Do momentum projections if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pi_snkT[mu][nu], result,
+                            i_p);
+                }
+            }
+
+            // S
+            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
+            Usrc    = ci*Anu0;
+            Usnk    = ci*Amu;
+            vpContraction(result, prop0, tmpProp, Usrc, Usnk, mu);
+            *vpTensor[mu][nu] += q*q*result;
+            // Do momentum projections if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pi_S[mu][nu], result,
+                            i_p);
+                }
+            }
+
+            // 4C
+            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
+            Usrc    = Complex(1.0,0.0);
+            Usnk    = ci*Amu;
+            vpContraction(result, propQ, tmpProp, Usrc, Usnk, mu);
+            Usrc    = ci*Anu0;
+            vpContraction(buf, propQ, tmpProp, Usrc, mu);
+            result += buf;
+            vpContraction(buf, prop0, *muPropQ[nu], Usrc, mu);
+            result += buf;
+            Usrc = Complex(1.0,0.0);
+            Usnk = ci*Amu;
+            vpContraction(buf, prop0, *muPropQ[nu], Usrc, Usnk, mu);
+            result += buf;
+            *vpTensor[mu][nu] += q*q*result;
+            // Do momentum projections if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pi_4C[mu][nu], result,
+                            i_p);
+                }
+            }
+
+            // X
+            Usrc = Complex(1.0,0.0);
+            vpContraction(result, propQ, *muPropQ[nu], Usrc, mu);
+            *vpTensor[mu][nu] += q*q*result;
+            // Do momentum projections if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pi_X[mu][nu], result,
+                            i_p);
+                }
+            }
+
+            // 2E
+            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
+            Usrc    = Complex(1.0,0.0);
+            vpContraction(result, propSun, tmpProp, Usrc, mu);
+            tmpProp = Cshift(propSun, nu, -1);     // S_\Sigma(0|x-a\hat{\nu})
+                               //(Note: <S(0|x-a\hat{\nu})> = <S(a\hat{\nu}|x)>)
+            vpContraction(buf, prop0, tmpProp, Usrc, mu);
+            result += buf;
+            *vpTensor[mu][nu] += q*q*result;
+            // Do momentum projections if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pi_2E[mu][nu], result,
+                            i_p);
+                }
+            }
+
+            // 2T
+            tmpProp = Cshift(prop0, nu, -1);     // S_0(a\hat{\nu}|x)
+            Usrc    = Complex(1.0,0.0);
+            vpContraction(result, propTad, tmpProp, Usrc, mu);
+            tmpProp = Cshift(propTad, nu, -1);     // S_T(0|x-a\hat{\nu})
+            vpContraction(buf, prop0, tmpProp, Usrc, mu);
+            result += buf;
+            *vpTensor[mu][nu] += q*q*result;
+            // Do momentum projections if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pi_2T[mu][nu], result,
+                            i_p);
+                }
+            }
+
+            // Do momentum projections of full VP if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pi[mu][nu],
+                            *vpTensor[mu][nu], i_p);
+                }
+            }
+        }
+    }
+
+    // OUTPUT IF NECESSARY
+    if (!par().output.empty())
+    {
+        LOG(Message) << "Saving momentum-projected HVP to '"
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
+                     << std::endl;
+        saveResult(par().output, "HVP", outputData);
+    }
+}
+
+void TScalarVP::makeCaches(void) // fill momPhase_ and, once, the phase fields
+{
+    envGetTmp(ScalarField, buf);
+    if (par().output.empty()) return; // phases only serve momentum projection
+    if (!momPhasesDone_)
+    {
+        LOG(Message) << "Caching phases for momentum projections..."
+                     << std::endl;
+    }
+    auto l = env().getGrid()->FullDimensions();
+    // always refresh the pointers: cached fields may predate this instance
+    momPhase_.clear();
+    for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+    {
+        auto &momph_ip = envGet(ScalarField, momPhaseName_[i_p]);
+        momPhase_.push_back(&momph_ip);
+        if (momPhasesDone_) continue; // field content already computed
+        std::vector<int> mom = strToVec<int>(par().outputMom[i_p]);
+        momph_ip = Zero();
+        for (unsigned int j = 0; j < env().getNd()-1; ++j)
+        {
+            Real twoPiL = M_PI*2./l[j];
+            LatticeCoordinate(buf, j);
+            buf = mom[j]*twoPiL*buf;
+            momph_ip = momph_ip + buf;
+        }
+        momph_ip = exp(-Complex(0.0,1.0)*momph_ip); // last dimension excluded
+    }
+}
+
+void TScalarVP::vpContraction(ScalarField &vp,                    // out: result
+                   ScalarField &prop_0_x, ScalarField &prop_nu_x, // S(0|x), S(a nu|x)
+                   TComplex u_src, ScalarField &u_snk, int mu)    // U_nu(0), U_mu(x)
+{
+    // Note: this function assumes a point source is used.
+    vp = adj(prop_nu_x) * u_snk * Cshift(prop_0_x, mu, 1) * u_src;
+    vp -= Cshift(adj(prop_nu_x), mu, 1) * adj(u_snk) * prop_0_x * u_src;
+    vp = 2.0*real(vp); // 2 Re(...) as in the diagram notation at top of file
+}
+
+void TScalarVP::vpContraction(ScalarField &vp,                    // out: result
+                   ScalarField &prop_0_x, ScalarField &prop_nu_x, // S(0|x), S(a nu|x)
+                   TComplex u_src, int mu)                        // source link only
+{
+    // Note: this function assumes a point source is used.
+    vp = adj(prop_nu_x) * Cshift(prop_0_x, mu, 1) * u_src;  // no sink-link factor
+    vp -= Cshift(adj(prop_nu_x), mu, 1) * prop_0_x * u_src;
+    vp = 2.0*real(vp); // 2 Re(...) as in the diagram notation at top of file
+}
+
+void TScalarVP::project(std::vector<Complex> &projection, const ScalarField &vp, int i_p)
+{
+    std::vector<TComplex> sliced;
+    envGetTmp(ScalarField, buf);
+    // apply the i_p-th momentum phase, slice-sum along Tp, strip tensor type
+    buf = vp*(*momPhase_[i_p]);
+    sliceSum(buf, sliced, Tp);
+    projection.clear();
+    for (const auto &site: sliced)
+    {
+        projection.push_back(TensorRemove(site));
+    }
+}
+
+void TScalarVP::momD1(ScalarField &s, FFT &fft)
+{
+    auto        &A = envGet(EmField, par().emField);
+    Complex     ci(0.0,1.0);
+    // buf, result and Amu are the module temporaries declared in setup()
+    envGetTmp(ScalarField, buf);
+    envGetTmp(ScalarField, result);
+    envGetTmp(ScalarField, Amu);
+    // first pass: phase_[mu] multiplies s before the A_mu insertion
+    result = Zero();
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);
+        buf = (*phase_[mu])*s;
+        fft.FFT_all_dim(buf, buf, FFT::backward);
+        buf = Amu*buf;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        result = result - ci*buf;
+    }
+    fft.FFT_all_dim(s, s, FFT::backward); // s itself is transformed in place
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Amu = peekLorentz(A, mu);
+        buf = Amu*s;
+        fft.FFT_all_dim(buf, buf, FFT::forward);
+        result = result + ci*adj(*phase_[mu])*buf; // second pass: adj(phase) applied after
+    }
+    // the input field is overwritten with the accumulated result
+    s = result;
+}
diff --git a/Hadrons/Modules/MScalar/ScalarVP.hpp b/Hadrons/Modules/MScalar/ScalarVP.hpp
new file mode 100644
index 00000000..3cd01249
--- /dev/null
+++ b/Hadrons/Modules/MScalar/ScalarVP.hpp
@@ -0,0 +1,129 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalar/ScalarVP.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalar_ScalarVP_hpp_
+#define Hadrons_MScalar_ScalarVP_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         Scalar vacuum polarisation                         *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalar)
+
+class ScalarVPPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarVPPar,
+                                    std::string, emField,    // name of the EmField object
+                                    std::string, scalarProp, // name of the charged propagator module
+                                    std::string, output,     // result file stem; empty disables projections
+                                    std::vector<std::string>, outputMom); // momenta, parsed with strToVec<int>
+};
+
+class TScalarVP: public Module<ScalarVPPar>
+{
+public:
+    BASIC_TYPE_ALIASES(SIMPL,);
+    typedef PhotonR::GaugeField     EmField;
+    typedef PhotonR::GaugeLinkField EmComp;
+    class Result: Serializable // record passed to saveResult() in execute()
+    {
+    public:
+        class Projection: Serializable // one entry per requested momentum
+        {
+        public:
+            GRID_SERIALIZABLE_CLASS_MEMBERS(Projection,
+                                            std::vector<int>,     momentum,
+                                            std::vector<std::vector<std::vector<Complex>>>, pi,
+                                            std::vector<std::vector<std::vector<Complex>>>, pi_free,
+                                            std::vector<std::vector<std::vector<Complex>>>, pi_2E,
+                                            std::vector<std::vector<std::vector<Complex>>>, pi_2T,
+                                            std::vector<std::vector<std::vector<Complex>>>, pi_S,
+                                            std::vector<std::vector<std::vector<Complex>>>, pi_4C,
+                                            std::vector<std::vector<std::vector<Complex>>>, pi_X,
+                                            std::vector<std::vector<std::vector<Complex>>>, pi_srcT,
+                                            std::vector<std::vector<std::vector<Complex>>>, pi_snkT);
+        };
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
+                                        std::vector<int>,        lattice_size,
+                                        double,                  mass,
+                                        double,                  charge,
+                                        std::vector<Projection>, projection);
+    };
+public:
+    // constructor
+    TScalarVP(const std::string name);
+    // destructor
+    virtual ~TScalarVP(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+private:
+    void makeCaches(void);
+    // conserved vector two-point contraction
+    void vpContraction(ScalarField &vp,
+                       ScalarField &prop_0_x, ScalarField &prop_nu_x,
+                       TComplex u_src, ScalarField &u_snk, int mu);
+    // conserved vector two-point contraction with unit gauge link at sink
+    void vpContraction(ScalarField &vp,
+                       ScalarField &prop_0_x, ScalarField &prop_nu_x,
+                       TComplex u_src, int mu);
+    // momentum-project vp and slice-sum it into 'projection' (no file I/O)
+    void project(std::vector<Complex> &projection, const ScalarField &vp,
+                 int i_p);
+    // momentum-space Delta_1 insertion
+    void momD1(ScalarField &s, FFT &fft);
+private:
+    bool                                        momPhasesDone_; // only assigned in setup() when output is set
+    std::string                                 freeMomPropName_, GFSrcName_,
+                                                prop0Name_, propQName_,
+                                                propSunName_, propTadName_,
+                                                fftName_;
+    std::vector<std::string>                    phaseName_, muPropQName_,
+                                                momPhaseName_;
+    std::vector<std::vector<std::string> >      vpTensorName_;
+    std::vector<ScalarField *>                  phase_, momPhase_; // non-owning pointers to environment objects
+};
+
+MODULE_REGISTER(ScalarVP, TScalarVP, MScalar);
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalar_ScalarVP_hpp_
diff --git a/Hadrons/Modules/MScalar/VPCounterTerms.cc b/Hadrons/Modules/MScalar/VPCounterTerms.cc
new file mode 100644
index 00000000..45956932
--- /dev/null
+++ b/Hadrons/Modules/MScalar/VPCounterTerms.cc
@@ -0,0 +1,260 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalar/VPCounterTerms.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalar/VPCounterTerms.hpp>
+#include <Hadrons/Modules/MScalar/Scalar.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalar;
+
+/******************************************************************************
+*                  TVPCounterTerms implementation                             *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+// Only forwards the module name to the Module<VPCounterTermsPar> base.
+TVPCounterTerms::TVPCounterTerms(const std::string name)
+: Module<VPCounterTermsPar>(name) {}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// The only declared dependency is the object named by the 'source' parameter.
+std::vector<std::string> TVPCounterTerms::getInput(void)
+{
+    std::vector<std::string> deps(1, par().source);
+    return deps;
+}
+
+// No output object names are declared by this module.
+std::vector<std::string> TVPCounterTerms::getOutput(void)
+{
+    // value-initialised, i.e. empty, list
+    return std::vector<std::string>();
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+// Names the module's lattice objects and declares them via the env* macros.
+void TVPCounterTerms::setup(void)
+{
+    // both name lists are reset: momPhaseName_ used to grow on every setup()
+    freeMomPropName_ = FREEMOMPROP(par().mass);
+    GFSrcName_       = getName() + "_DinvSrc";
+    phatsqName_      = getName() + "_pHatSquared";
+    prop0Name_       = getName() + "_freeProp";
+    twoscalarName_   = getName() + "_2scalarProp";
+    psquaredName_    = getName() + "_psquaredProp";
+    phaseName_.clear();
+    momPhaseName_.clear();
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
+    }
+    // momentum phases are only named when an output file is requested
+    if (!par().output.empty())
+    {
+        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+        {
+            momPhaseName_.push_back("_momentumphase_" + std::to_string(i_p));
+        }
+    }
+    envCreateLat(ScalarField, freeMomPropName_);
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        envCreateLat(ScalarField, phaseName_[mu]);
+    }
+    envCreateLat(ScalarField, phatsqName_);
+    envCreateLat(ScalarField, GFSrcName_);
+    envCreateLat(ScalarField, prop0Name_);
+    envCreateLat(ScalarField, twoscalarName_);
+    envCreateLat(ScalarField, psquaredName_);
+    for (unsigned int i_p = 0; i_p < momPhaseName_.size(); ++i_p)
+    {
+        envCacheLat(ScalarField, momPhaseName_[i_p]);
+    }
+    envTmpLat(ScalarField, "buf");
+    envTmpLat(ScalarField, "tmp_vp");
+    envTmpLat(ScalarField, "vpPhase");
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Builds the free propagators and contractions; saves them if 'output' is set.
+void TVPCounterTerms::execute(void)
+{
+    auto &source = envGet(ScalarField, par().source);
+    Complex     ci(0.0,1.0);
+    FFT         fft(env().getGrid());
+    envGetTmp(ScalarField, buf);
+    envGetTmp(ScalarField, tmp_vp);
+    // forget pointers cached by an earlier call (these vectors used to grow)
+    phase_.clear();
+    momPhase_.clear();
+
+    // Momentum-space free scalar propagator
+    auto &G = envGet(ScalarField, freeMomPropName_);
+    SIMPL::MomentumSpacePropagator(G, par().mass);
+
+    // Phases and hat{p}^2
+    auto &phatsq = envGet(ScalarField, phatsqName_);
+    Coordinate l = env().getGrid()->FullDimensions();
+    LOG(Message) << "Calculating shift phases..." << std::endl;
+    phatsq = Zero();
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    {
+        Real    twoPiL = M_PI*2./l[mu];
+        auto &phmu  = envGet(ScalarField, phaseName_[mu]);
+        LatticeCoordinate(buf, mu);
+        phmu = exp(ci*twoPiL*buf);
+        phase_.push_back(&phmu);
+        buf = 2.*sin(.5*twoPiL*buf);
+        phatsq = phatsq + buf*buf;
+    }
+
+    // G*F*src
+    auto &GFSrc       = envGet(ScalarField, GFSrcName_);
+    fft.FFT_all_dim(GFSrc, source, FFT::forward);
+    GFSrc = G*GFSrc;
+
+    // Position-space free scalar propagator
+    auto &prop0       = envGet(ScalarField, prop0Name_);
+    prop0 = GFSrc;
+    fft.FFT_all_dim(prop0, prop0, FFT::backward);
+
+    // Propagators for counter-terms: G*G*F*src and G*hat{p}^2*G*F*src
+    auto &twoscalarProp        = envGet(ScalarField, twoscalarName_);
+    auto &psquaredProp         = envGet(ScalarField, psquaredName_);
+    twoscalarProp = G*GFSrc;
+    fft.FFT_all_dim(twoscalarProp, twoscalarProp, FFT::backward);
+    psquaredProp = G*phatsq*GFSrc;
+    fft.FFT_all_dim(psquaredProp, psquaredProp, FFT::backward);
+
+    // Prepare output data structure if necessary
+    Result outputData;
+    if (!par().output.empty())
+    {
+        outputData.projection.resize(par().outputMom.size());
+        outputData.lattice_size = env().getGrid()->FullDimensions().toVector();
+        outputData.mass = par().mass;
+        for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+        {
+            outputData.projection[i_p].momentum = strToVec<int>(par().outputMom[i_p]);
+            outputData.projection[i_p].twoScalar.resize(env().getNd());
+            outputData.projection[i_p].threeScalar.resize(env().getNd());
+            outputData.projection[i_p].pSquaredInsertion.resize(env().getNd());
+            for (unsigned int nu = 0; nu < env().getNd(); ++nu)
+            {
+                outputData.projection[i_p].twoScalar[nu].resize(env().getNd());
+                outputData.projection[i_p].threeScalar[nu].resize(env().getNd());
+                outputData.projection[i_p].pSquaredInsertion[nu].resize(env().getNd());
+            }
+            // Phase exp(-i*sum_j p_j*(2*pi/L_j)*x_j) over the first Nd-1 directions
+            auto &momph_ip = envGet(ScalarField, momPhaseName_[i_p]);
+            momph_ip = Zero();
+            for (unsigned int j = 0; j < env().getNd()-1; ++j)
+            {
+                Real twoPiL = M_PI*2./l[j];
+                LatticeCoordinate(buf, j);
+                buf = outputData.projection[i_p].momentum[j]*twoPiL*buf;
+                momph_ip = momph_ip + buf;
+            }
+            momph_ip = exp(-ci*momph_ip);
+            momPhase_.push_back(&momph_ip);
+        }
+    }
+
+    // Contractions
+    for (unsigned int nu = 0; nu < env().getNd(); ++nu)
+    {
+        buf = adj(Cshift(prop0, nu, -1));
+        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+        {
+            // Two-scalar loop
+            tmp_vp = buf * Cshift(prop0, mu, 1);
+            tmp_vp -= Cshift(buf, mu, 1) * prop0;
+            tmp_vp = 2.0*real(tmp_vp);
+            // Output if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].twoScalar[mu][nu],
+                            tmp_vp, i_p);
+                }
+            }
+
+            // Three-scalar loop (no vertex)
+            tmp_vp = buf * Cshift(twoscalarProp, mu, 1);
+            tmp_vp -= Cshift(buf, mu, 1) * twoscalarProp;
+            tmp_vp = 2.0*real(tmp_vp);
+            // Output if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].threeScalar[mu][nu],
+                            tmp_vp, i_p);
+                }
+            }
+
+            // Three-scalar loop (hat{p}^2 insertion)
+            tmp_vp = buf * Cshift(psquaredProp, mu, 1);
+            tmp_vp -= Cshift(buf, mu, 1) * psquaredProp;
+            tmp_vp = 2.0*real(tmp_vp);
+            // Output if necessary
+            if (!par().output.empty())
+            {
+                for (unsigned int i_p = 0; i_p < par().outputMom.size(); ++i_p)
+                {
+                    project(outputData.projection[i_p].pSquaredInsertion[mu][nu],
+                            tmp_vp, i_p);
+                }
+            }
+        }
+    }
+
+    // OUTPUT IF NECESSARY
+    if (!par().output.empty())
+    {
+        LOG(Message) << "Saving momentum-projected correlators to '"
+                     << RESULT_FILE_NAME(par().output, vm().getTrajectory()) << "'..."
+                     << std::endl;
+        saveResult(par().output, "scalar_loops", outputData);
+    }
+}
+
+// Multiplies vp by the i_p-th stored momentum phase and slice-sums along Tp.
+void TVPCounterTerms::project(std::vector<Complex> &projection, const ScalarField &vp, int i_p)
+{
+    envGetTmp(ScalarField, vpPhase);
+    std::vector<TComplex> slices;
+    vpPhase = vp*(*momPhase_[i_p]);
+    sliceSum(vpPhase, slices, Tp);
+    projection.resize(slices.size());
+    for (unsigned int t = 0; t < projection.size(); ++t)
+    {
+        projection[t] = TensorRemove(slices[t]);
+    }
+}
diff --git a/Hadrons/Modules/MScalar/VPCounterTerms.hpp b/Hadrons/Modules/MScalar/VPCounterTerms.hpp
new file mode 100644
index 00000000..8bbc3e20
--- /dev/null
+++ b/Hadrons/Modules/MScalar/VPCounterTerms.hpp
@@ -0,0 +1,103 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalar/VPCounterTerms.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: James Harrison <jch1g10@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalar_VPCounterTerms_hpp_
+#define Hadrons_MScalar_VPCounterTerms_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         VPCounterTerms                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalar)
+
+class VPCounterTermsPar: Serializable // parameter set read by TVPCounterTerms through par()
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(VPCounterTermsPar,
+                                    std::string, source, // name of the source field fetched in execute()
+                                    double,      mass,   // mass given to SIMPL::MomentumSpacePropagator()
+                                    std::string, output, // passed to saveResult(); empty disables all output
+                                    std::vector<std::string>, outputMom); // momenta, parsed with strToVec<int>
+};
+
+class TVPCounterTerms: public Module<VPCounterTermsPar> // module declared in namespace MScalar
+{
+public:
+    BASIC_TYPE_ALIASES(SIMPL,); // presumably provides the ScalarField alias used below - confirm in Global.hpp
+    class Result: Serializable // record passed to saveResult() under the name "scalar_loops"
+    {
+    public:
+        class Projection: Serializable // data for one entry of par().outputMom
+        {
+        public:
+            GRID_SERIALIZABLE_CLASS_MEMBERS(Projection,
+                                            std::vector<int>,     momentum, // parsed from the outputMom string
+                                            std::vector<std::vector<std::vector<Complex>>>, twoScalar, // [mu][nu][slice]
+                                            std::vector<std::vector<std::vector<Complex>>>, threeScalar, // [mu][nu][slice]
+                                            std::vector<std::vector<std::vector<Complex>>>, pSquaredInsertion); // [mu][nu][slice]
+        };
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
+                                        std::vector<int>,        lattice_size, // full grid dimensions
+                                        double,                  mass, // copy of par().mass
+                                        std::vector<Projection>, projection); // one entry per output momentum
+    };
+public:
+    // constructor: forwards the module name to the Module base
+    TVPCounterTerms(const std::string name);
+    // destructor (empty)
+    virtual ~TVPCounterTerms(void) {};
+    // dependency relation: the input is par().source, no outputs are declared
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+protected:
+    // setup: builds object names and declares lattices through the env* macros
+    virtual void setup(void);
+    // execution: computes the scalar loops and optionally saves their projections
+    virtual void execute(void);
+private:
+    void project(std::vector<Complex> &projection, const ScalarField &vp, int i_p); // phase-multiply vp, then sliceSum along Tp
+private:
+    std::string                freeMomPropName_, GFSrcName_, phatsqName_, prop0Name_,
+                               twoscalarName_, twoscalarVertexName_,
+                               psquaredName_, psquaredVertexName_; // NOTE(review): the two *VertexName_ members are not referenced in VPCounterTerms.cc
+    std::vector<std::string>   phaseName_, momPhaseName_; // object names filled in setup()
+    std::vector<ScalarField *> phase_, momPhase_; // pointers stored by execute(); momPhase_ is read by project()
+};
+
+MODULE_REGISTER(VPCounterTerms, TVPCounterTerms, MScalar); // registration macro (expansion not visible here)
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalar_VPCounterTerms_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/Div.cc b/Hadrons/Modules/MScalarSUN/Div.cc
new file mode 100644
index 00000000..c5d67579
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/Div.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/Div.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/Div.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TDiv<ScalarNxNAdjImplR<2>>; // explicit instantiations of TDiv for N = 2..6
+template class Grid::Hadrons::MScalarSUN::TDiv<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TDiv<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TDiv<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TDiv<ScalarNxNAdjImplR<6>>;
+
diff --git a/Hadrons/Modules/MScalarSUN/Div.hpp b/Hadrons/Modules/MScalarSUN/Div.hpp
new file mode 100644
index 00000000..9980d0f6
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/Div.hpp
@@ -0,0 +1,155 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/Div.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_Div_hpp_
+#define Hadrons_MScalarSUN_Div_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                       Divergence of a vector field                         *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class DivPar: Serializable // parameter set read by TDiv through par()
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(DivPar,
+                                    std::vector<std::string>, op, // input field names; setup() requires one per dimension
+                                    DiffType,                 type, // derivative type forwarded to dmuAcc()
+                                    std::string,              output); // passed to saveResult(); empty disables output
+};
+
+class DivResult: Serializable // record saved by TDiv::execute() when 'output' is not empty
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(DivResult,
+                                    DiffType, type, // copy of par().type
+                                    Complex,  value); // TensorRemove(sum(div)) of the computed field
+};
+
+template <typename SImpl>
+class TDiv: public Module<DivPar> // module writing the accumulated derivative of its inputs to one lattice
+{
+public:
+    typedef typename SImpl::Field        Field;
+    typedef typename SImpl::ComplexField ComplexField;
+public:
+    // constructor: forwards the module name to Module<DivPar>
+    TDiv(const std::string name);
+    // destructor (empty)
+    virtual ~TDiv(void) {};
+    // dependency relation: inputs are par().op, the single output is getName()
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: checks op.size() against Nd and creates the output lattice
+    virtual void setup(void);
+    // execution: calls dmuAcc() for every direction, optionally saves the sum
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(DivSU2, TDiv<ScalarNxNAdjImplR<2>>, MScalarSUN); // one registration per N = 2..6
+MODULE_REGISTER_TMP(DivSU3, TDiv<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(DivSU4, TDiv<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(DivSU5, TDiv<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(DivSU6, TDiv<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                           TDiv implementation                              *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+// Only forwards the module name to the Module<DivPar> base.
+template <typename SImpl>
+TDiv<SImpl>::TDiv(const std::string name)
+: Module<DivPar>(name) {}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TDiv<SImpl>::getInput(void)
+{
+    return std::vector<std::string>(par().op); // inputs: a copy of the 'op' name list
+}
+
+// The single output object carries the module's own name.
+template <typename SImpl>
+std::vector<std::string> TDiv<SImpl>::getOutput(void)
+{
+    std::vector<std::string> products(1, getName());
+    return products;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TDiv<SImpl>::setup(void) // validates the parameters, then creates the output lattice
+{
+    if (par().op.size() != env().getNd()) // exactly one input name per dimension is required
+    {
+        HADRONS_ERROR(Size, "the number of components differs from number of dimensions");
+    }
+    envCreateLat(ComplexField, getName()); // output lattice, named after the module
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Accumulates dmuAcc() of every input component into the output lattice.
+template <typename SImpl>
+void TDiv<SImpl>::execute(void)
+{
+    const auto nDim = env().getNd();
+
+    LOG(Message) << "Computing the " << par().type << " divergence of [";
+    for (unsigned int mu = 0; mu < nDim; ++mu)
+    {
+        std::cout << "'" << par().op[mu] << ((mu == nDim - 1) ? "']" : "', ");
+    }
+    std::cout << std::endl;
+
+    auto &result = envGet(ComplexField, getName());
+    result = Zero();
+    for (unsigned int mu = 0; mu < nDim; ++mu)
+    {
+        auto &component = envGet(ComplexField, par().op[mu]);
+        dmuAcc(result, component, mu, par().type);
+    }
+    if (!par().output.empty())
+    {
+        DivResult summary;
+        summary.type  = par().type;
+        summary.value = TensorRemove(sum(result));
+        saveResult(par().output, "div", summary);
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_Div_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/EMT.cc b/Hadrons/Modules/MScalarSUN/EMT.cc
new file mode 100644
index 00000000..08ba2a83
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/EMT.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/EMT.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/EMT.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TEMT<ScalarNxNAdjImplR<2>>; // explicit instantiations of TEMT for N = 2..6
+template class Grid::Hadrons::MScalarSUN::TEMT<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TEMT<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TEMT<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TEMT<ScalarNxNAdjImplR<6>>;
+
diff --git a/Hadrons/Modules/MScalarSUN/EMT.hpp b/Hadrons/Modules/MScalarSUN/EMT.hpp
new file mode 100644
index 00000000..c5fd10a5
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/EMT.hpp
@@ -0,0 +1,217 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/EMT.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_EMT_hpp_
+#define Hadrons_MScalarSUN_EMT_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         Energy-momentum tensor                             *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class EMTPar: Serializable // parameter set read by TEMT through par()
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(EMTPar,
+                                    std::string, kinetic, // base name: varName(kinetic, mu, nu) and varName(kinetic, "sum")
+                                    std::string, phiPow, // base name: varName(phiPow, 2) and varName(phiPow, 4)
+                                    std::string, improvement, // base name of the optional improvement inputs; empty disables them
+                                    double     , m2, // coefficient of the phiPow-2 field in the diagonal subtraction
+                                    double     , lambda, // coefficient of the phiPow-4 field in the diagonal subtraction
+                                    double     , g, // every component is multiplied by N/g
+                                    double     , xi, // coefficient of the improvement term
+                                    std::string, output); // passed to saveResult(); empty disables output
+};
+
+class EMTResult: Serializable // record saved by TEMT::execute() under the name "emt"
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(EMTResult,
+                                    std::vector<std::vector<Complex>>, value, // resized to nd x nd; filled from lattice sums in execute()
+                                    double,                            m2, // copy of par().m2
+                                    double,                            lambda, // copy of par().lambda
+                                    double,                            g, // copy of par().g
+                                    double,                            xi); // copy of par().xi
+};
+
+template <typename SImpl>
+class TEMT: public Module<EMTPar> // module building one lattice per tensor component with mu <= nu
+{
+public:
+    typedef typename SImpl::Field        Field;
+    typedef typename SImpl::ComplexField ComplexField;
+public:
+    // constructor: forwards the module name to Module<EMTPar>
+    TEMT(const std::string name);
+    // destructor (empty)
+    virtual ~TEMT(void) {};
+    // dependency relation: names are built with varName() from the parameters
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the output lattices
+    virtual void setup(void);
+    // execution: combines the inputs component by component, optionally saves sums
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(EMTSU2, TEMT<ScalarNxNAdjImplR<2>>, MScalarSUN); // one registration per N = 2..6
+MODULE_REGISTER_TMP(EMTSU3, TEMT<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(EMTSU4, TEMT<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(EMTSU5, TEMT<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(EMTSU6, TEMT<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                           TEMT implementation                              *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+// Only forwards the module name to the Module<EMTPar> base.
+template <typename SImpl>
+TEMT<SImpl>::TEMT(const std::string name)
+: Module<EMTPar>(name) {}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// Kinetic (and optional improvement) terms for mu <= nu, then the kinetic
+// "sum" object and the phiPow objects for powers 2 and 4.
+template <typename SImpl>
+std::vector<std::string> TEMT<SImpl>::getInput(void)
+{
+    const unsigned int       nd       = env().getNd();
+    const bool               improved = !par().improvement.empty();
+    std::vector<std::string> deps;
+
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    for (unsigned int nu = mu; nu < nd; ++nu)
+    {
+        deps.push_back(varName(par().kinetic, mu, nu));
+        if (improved) deps.push_back(varName(par().improvement, mu, nu));
+    }
+    deps.push_back(varName(par().kinetic, "sum"));
+    deps.push_back(varName(par().phiPow, 2));
+    deps.push_back(varName(par().phiPow, 4));
+    return deps;
+}
+
+// One output object per component with mu <= nu, named with varName().
+template <typename SImpl>
+std::vector<std::string> TEMT<SImpl>::getOutput(void)
+{
+    const unsigned int       nd = env().getNd();
+    std::vector<std::string> products;
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    for (unsigned int nu = mu; nu < nd; ++nu)
+    {
+        products.push_back(varName(getName(), mu, nu));
+    }
+    return products;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TEMT<SImpl>::setup(void) // creates one complex lattice per component with mu <= nu
+{
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    for (unsigned int nu = mu; nu < env().getNd(); ++nu) // nu starts at mu: components with nu < mu get no lattice
+    {
+        envCreateLat(ComplexField, varName(getName(), mu, nu)); // same names as returned by getOutput()
+    }
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+// Builds every component with mu <= nu and optionally saves their lattice sums.
+template <typename SImpl>
+void TEMT<SImpl>::execute(void)
+{
+    LOG(Message) << "Computing energy-momentum tensor" << std::endl;
+    LOG(Message) << "  kinetic terms: '" << par().kinetic << "'" << std::endl;
+    LOG(Message) << "      tr(phi^n): '" << par().phiPow << "'" << std::endl;
+    if (!par().improvement.empty())
+    {
+        LOG(Message) << "    improvement: '" << par().improvement << "'" << std::endl;
+    }
+    LOG(Message) << "            m^2= " << par().m2 << std::endl;
+    LOG(Message) << "         lambda= " << par().lambda << std::endl;
+    LOG(Message) << "              g= " << par().g << std::endl;
+    if (!par().improvement.empty())
+    {
+        LOG(Message) << "             xi= " << par().xi << std::endl;
+    }
+
+    const unsigned int N = SImpl::Group::Dimension, nd = env().getNd();
+    auto               &trphi2 = envGet(ComplexField, varName(par().phiPow, 2));
+    auto               &trphi4 = envGet(ComplexField, varName(par().phiPow, 4));
+    auto               &sumkin = envGet(ComplexField, varName(par().kinetic, "sum"));
+    EMTResult          result;
+    if (!par().output.empty())
+    {
+        result.m2     = par().m2;
+        result.g      = par().g;
+        result.lambda = par().lambda;
+        result.xi     = par().xi;
+        result.value.resize(nd, std::vector<Complex>(nd));
+    }
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    for (unsigned int nu = mu; nu < nd; ++nu)
+    {
+        auto &out   = envGet(ComplexField, varName(getName(), mu, nu));
+        auto &trkin = envGet(ComplexField, varName(par().kinetic, mu, nu));
+        // out = (N/g)*[2*kin + xi*imp - delta_{mu nu}*(sumkin + m2*phi2 + lambda*phi4)]
+        out = 2.*trkin;
+        if (!par().improvement.empty())
+        {
+            auto &imp = envGet(ComplexField, varName(par().improvement, mu, nu));
+            out += par().xi*imp;
+        }
+        if (mu == nu)
+        {
+            out -= sumkin + par().m2*trphi2 + par().lambda*trphi4;
+        }
+        out *= N/par().g;
+        if (!par().output.empty())
+        {
+            // mirror into (nu,mu), which this loop never visits; the former
+            // reversed assignment overwrote the sum with that unset entry
+            result.value[mu][nu] = TensorRemove(sum(out));
+            result.value[nu][mu] = result.value[mu][nu];
+        }
+    }
+    if (!par().output.empty())
+    {
+        saveResult(par().output, "emt", result);
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_EMT_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/Grad.cc b/Hadrons/Modules/MScalarSUN/Grad.cc
new file mode 100644
index 00000000..15904af1
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/Grad.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/Grad.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/Grad.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TGrad<ScalarNxNAdjImplR<2>>; // explicit instantiations of TGrad for N = 2..6
+template class Grid::Hadrons::MScalarSUN::TGrad<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TGrad<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TGrad<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TGrad<ScalarNxNAdjImplR<6>>;
diff --git a/Hadrons/Modules/MScalarSUN/Grad.hpp b/Hadrons/Modules/MScalarSUN/Grad.hpp
new file mode 100644
index 00000000..93a8ddda
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/Grad.hpp
@@ -0,0 +1,166 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/Grad.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_Grad_hpp_
+#define Hadrons_MScalarSUN_Grad_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                       Gradient of a complex field                          *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class GradPar: Serializable // parameters of the Grad module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(GradPar,
+                                    std::string, op,      // name of the input complex field
+                                    DiffType,    type,    // derivative type handed to dmu()
+                                    std::string, output); // result file stem, empty = nothing saved
+};
+
+class GradResult: Serializable // record written by TGrad::execute()
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(GradResult,
+                                    DiffType,              type,   // copy of the 'type' parameter
+                                    std::vector<Complex>,  value); // value[mu] = lattice sum of the mu-th derivative
+};
+
+template <typename SImpl>
+class TGrad: public Module<GradPar>
+{
+public:
+    typedef typename SImpl::Field        Field;
+    typedef typename SImpl::ComplexField ComplexField;
+public:
+    // constructor: 'name' is forwarded to Module<GradPar>
+    TGrad(const std::string name);
+    // destructor
+    virtual ~TGrad(void) {};
+    // dependency relation: input is par().op, outputs are one object per direction
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates one ComplexField per direction
+    virtual void setup(void);
+    // execution: fills the outputs through dmu() and optionally saves their lattice sums
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(GradSU2, TGrad<ScalarNxNAdjImplR<2>>, MScalarSUN); // one registration per N = 2..6
+MODULE_REGISTER_TMP(GradSU3, TGrad<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(GradSU4, TGrad<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(GradSU5, TGrad<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(GradSU6, TGrad<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                         TGrad implementation                               *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TGrad<SImpl>::TGrad(const std::string name)
+: Module<GradPar>(name)
+{} // nothing to do beyond base-class construction
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TGrad<SImpl>::getInput(void)
+{
+    // single dependency: the object named by the 'op' parameter
+    std::vector<std::string> deps(1, par().op);
+    return deps;
+}
+
+template <typename SImpl>
+std::vector<std::string> TGrad<SImpl>::getOutput(void)
+{
+    // one output object per lattice direction, named through varName()
+    const auto               nDim = env().getNd();
+    std::vector<std::string> names;
+
+    for (unsigned int dir = 0; dir < nDim; ++dir)
+    {
+        names.push_back(varName(getName(), dir));
+    }
+    return names;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TGrad<SImpl>::setup(void)
+{
+    // allocate the per-direction output fields listed by getOutput()
+    const auto nDim = env().getNd();
+    for (unsigned int dir = 0; dir < nDim; ++dir)
+    {
+        envCreateLat(ComplexField, varName(getName(), dir));
+    }
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TGrad<SImpl>::execute(void)
+{
+    LOG(Message) << "Computing the " << par().type << " gradient of '"
+                 << par().op << "'" << std::endl;
+
+    const bool         doSave = !par().output.empty(); // sums are kept only if a stem is given
+    const unsigned int nDim   = env().getNd();
+    auto               &src   = envGet(ComplexField, par().op);
+    GradResult         res;
+    if (doSave)
+    {
+        res.type = par().type;
+        res.value.resize(nDim);
+    }
+    for (unsigned int dir = 0; dir < nDim; ++dir)
+    {
+        // the derivative along 'dir' is written to the matching output field
+        auto &grad = envGet(ComplexField, varName(getName(), dir));
+        dmu(grad, src, dir, par().type);
+        if (doSave)
+        {
+            res.value[dir] = TensorRemove(sum(grad));
+        }
+    }
+    if (doSave)
+    {
+        saveResult(par().output, "grad", res);
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_Grad_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/ShiftProbe.cc b/Hadrons/Modules/MScalarSUN/ShiftProbe.cc
new file mode 100644
index 00000000..83454941
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/ShiftProbe.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/ShiftProbe.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/ShiftProbe.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TShiftProbe<ScalarNxNAdjImplR<2>>; // explicit instantiations for N = 2..6
+template class Grid::Hadrons::MScalarSUN::TShiftProbe<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TShiftProbe<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TShiftProbe<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TShiftProbe<ScalarNxNAdjImplR<6>>;
+
diff --git a/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp b/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
new file mode 100644
index 00000000..dcd56252
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
@@ -0,0 +1,177 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_ShiftProbe_hpp_
+#define Hadrons_MScalarSUN_ShiftProbe_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *         Ward identity phi^n probe with fields at different positions       *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+using ShiftPair = std::pair<int, int>; // (dimension, shift) as consumed by Cshift in execute()
+
+class ShiftProbePar: Serializable // parameters of the ShiftProbe module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ShiftProbePar,
+                                    std::string, field,   // name of the input field
+                                    std::string, shifts,  // parsed with strToVec<ShiftPair>() in execute()
+                                    std::string, output); // result file stem, empty = nothing saved
+};
+
+class ShiftProbeResult: Serializable // record written by TShiftProbe::execute()
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ShiftProbeResult,
+                                    std::string, shifts, // copy of the 'shifts' parameter string
+                                    Complex,     value); // lattice sum of the probe field
+};
+
+template <typename SImpl>
+class TShiftProbe: public Module<ShiftProbePar>
+{
+public:
+    typedef typename SImpl::Field                          Field;
+    typedef typename SImpl::ComplexField                   ComplexField;
+public:
+    // constructor: 'name' is forwarded to Module<ShiftProbePar>
+    TShiftProbe(const std::string name);
+    // destructor
+    virtual ~TShiftProbe(void) {};
+    // dependency relation: input is par().field, output is the object named getName()
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the output ComplexField and a temporary Field "acc"
+    virtual void setup(void);
+    // execution: traces the product of shifted copies of the input field
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(ShiftProbeSU2, TShiftProbe<ScalarNxNAdjImplR<2>>, MScalarSUN); // one registration per N = 2..6
+MODULE_REGISTER_TMP(ShiftProbeSU3, TShiftProbe<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(ShiftProbeSU4, TShiftProbe<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(ShiftProbeSU5, TShiftProbe<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(ShiftProbeSU6, TShiftProbe<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                        TShiftProbe implementation                          *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TShiftProbe<SImpl>::TShiftProbe(const std::string name)
+: Module<ShiftProbePar>(name)
+{} // nothing to do beyond base-class construction
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TShiftProbe<SImpl>::getInput(void)
+{
+    // single dependency: the object named by the 'field' parameter
+    std::vector<std::string> deps(1, par().field);
+    return deps;
+}
+
+template <typename SImpl>
+std::vector<std::string> TShiftProbe<SImpl>::getOutput(void)
+{
+    // the module produces one object, named after the module itself
+    std::vector<std::string> products(1, getName());
+    return products;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TShiftProbe<SImpl>::setup(void)
+{
+    envTmpLat(Field, "acc");               // temporary fetched by execute() through envGetTmp
+    envCreateLat(ComplexField, getName()); // the probe field, i.e. the module output
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TShiftProbe<SImpl>::execute(void)
+{
+    LOG(Message) << "Creating shift probe for shifts " << par().shifts
+                 << std::endl;
+    // probe = sign*tr(product over i of phi shifted by shift[i])
+    std::vector<ShiftPair> shift;
+    double                 sign;
+    auto                   &phi   = envGet(Field, par().field);
+    auto                   &probe = envGet(ComplexField, getName());
+    // parse the shift list; an even number of (dimension, shift) pairs is required
+    shift = strToVec<ShiftPair>(par().shifts);
+    if (shift.size() % 2 != 0)
+    {
+        HADRONS_ERROR(Size, "the number of shifts is odd");
+    }
+    sign = (shift.size() % 4 == 0) ? 1. : -1.; // +1 when the count is 0 mod 4, -1 when 2 mod 4
+    for (const auto &s: shift)
+    {
+        if (s.first >= env().getNd())
+        {
+            HADRONS_ERROR(Size, "dimension too large for shift <"
+                               + std::to_string(s.first) + " "
+                               + std::to_string(s.second) + ">" );
+        }
+    }
+    envGetTmp(Field, acc);
+    acc = 1.;
+    for (unsigned int i = 0; i < shift.size(); ++i)
+    {
+        if (shift[i].second == 0)
+        {
+            acc *= phi; // zero shift: multiply by the unshifted field
+        }
+        else
+        {
+            acc *= Cshift(phi, shift[i].first, shift[i].second);
+        }
+    }
+    probe = sign*trace(acc);
+    if (!par().output.empty())
+    {
+        ShiftProbeResult r;
+        // save the lattice sum of the probe together with the shift string
+        r.shifts = par().shifts;
+        r.value  = TensorRemove(sum(probe));
+        saveResult(par().output, "probe", r);
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_ShiftProbe_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/StochFreeField.cc b/Hadrons/Modules/MScalarSUN/StochFreeField.cc
new file mode 100644
index 00000000..70e8356d
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/StochFreeField.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/StochFreeField.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TStochFreeField<ScalarNxNAdjImplR<2>>; // explicit instantiations for N = 2..6
+template class Grid::Hadrons::MScalarSUN::TStochFreeField<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TStochFreeField<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TStochFreeField<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TStochFreeField<ScalarNxNAdjImplR<6>>;
diff --git a/Hadrons/Modules/MScalarSUN/StochFreeField.hpp b/Hadrons/Modules/MScalarSUN/StochFreeField.hpp
new file mode 100644
index 00000000..5ab260d3
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/StochFreeField.hpp
@@ -0,0 +1,178 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/StochFreeField.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_StochFreeField_hpp_
+#define Hadrons_MScalarSUN_StochFreeField_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                      stochastic free SU(N) scalar field                    *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class StochFreeFieldPar: Serializable // parameters of the StochFreeField module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(StochFreeFieldPar,
+                                    double, m2,        // sqrt(m2) is passed to MomentumSpacePropagator()
+                                    double, g,         // the weight is scaled by g/N
+                                    double, smearing); // coefficient in exp(-smearing*MomentaSquare)
+};
+
+template <typename SImpl>
+class TStochFreeField: public Module<StochFreeFieldPar>
+{
+public:
+    typedef typename SImpl::Field                    Field;
+    typedef typename SImpl::ComplexField             ComplexField;
+    typedef typename SImpl::Group                    Group;
+    typedef typename SImpl::SiteField::scalar_object Site;
+public:
+    // constructor: 'name' is forwarded to Module<StochFreeFieldPar>
+    TStochFreeField(const std::string name);
+    // destructor
+    virtual ~TStochFreeField(void) {};
+    // dependency relation: no input, output is the object named getName()
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the output field, temporaries and (once) the cached weight
+    virtual void setup(void);
+    // execution: draws the random momentum-space field and Fourier transforms it
+    virtual void execute(void);
+private:
+    bool create_weight; // set by setup(): true when execute() must compute the weight
+};
+
+MODULE_REGISTER_TMP(StochFreeFieldSU2, TStochFreeField<ScalarNxNAdjImplR<2>>, MScalarSUN); // one registration per N = 2..6
+MODULE_REGISTER_TMP(StochFreeFieldSU3, TStochFreeField<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(StochFreeFieldSU4, TStochFreeField<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(StochFreeFieldSU5, TStochFreeField<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(StochFreeFieldSU6, TStochFreeField<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                 TStochFreeField implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TStochFreeField<SImpl>::TStochFreeField(const std::string name)
+: Module<StochFreeFieldPar>(name), create_weight(false) // flag has a defined value before setup() runs
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TStochFreeField<SImpl>::getInput(void)
+{
+    // the field is built from parameters and random numbers only, so the
+    // module depends on no other object
+    return std::vector<std::string>();
+}
+
+template <typename SImpl>
+std::vector<std::string> TStochFreeField<SImpl>::getOutput(void)
+{
+    // the module produces one object, named after the module itself
+    std::vector<std::string> products(1, getName());
+    return products;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TStochFreeField<SImpl>::setup(void)
+{
+    create_weight = false; // stays false when the cached weight already exists
+    if (!env().hasCreatedObject("_" + getName() + "_weight"))
+    {
+        envCacheLat(ComplexField, "_" + getName() + "_weight");
+        envTmpLat(ComplexField, "smear"); // only used while building the weight
+        create_weight = true;             // tells execute() to compute the weight
+    }
+    envTmpLat(Field, "phift");     // field that execute() Fourier transforms into the output
+    envTmpLat(ComplexField, "ca"); // Gaussian noise drawn in execute()
+    envCreateLat(Field, getName());
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TStochFreeField<SImpl>::execute(void)
+{
+    LOG(Message) << "Generating stochastic scalar field" << std::endl;
+    // phi and w were created in setup(); w is the cached momentum-space weight
+    const   unsigned int N    = Group::Dimension;
+    const   unsigned int Nadj = Group::AdjointDimension;
+    auto    &phi              = envGet(Field, getName());
+    auto    &w                = envGet(ComplexField, "_" + getName() + "_weight");
+    auto    &rng              = rng4d();
+    double  trphi2;
+    FFT     fft(envGetGrid(Field));
+    Integer vol;
+
+    vol = 1; // vol becomes the product of env().getDim(d) over all directions
+    for(int d = 0; d < env().getNd(); d++)
+    {
+        vol = vol*env().getDim(d);
+    }
+    if (create_weight) // only when setup() had to create the cached weight
+    {
+        LOG(Message) << "Caching momentum-space scalar action" << std::endl;
+        // w = sqrt(vol)*sqrt(propagator*(g/N)*exp(-smearing*MomentaSquare))
+        envGetTmp(ComplexField, smear);
+        SImpl::MomentaSquare(smear);
+        smear = exp(-par().smearing*smear);
+        SImpl::MomentumSpacePropagator(w, sqrt(par().m2));
+        w *= par().g/N*smear;
+        w  = sqrt(vol)*sqrt(w);
+    }
+    LOG(Message) << "Generating random momentum-space field" << std::endl;
+    envGetTmp(Field, phift);
+    envGetTmp(ComplexField, ca);
+    phift = Zero();
+    for (int a = 0; a < Nadj; ++a) // one Gaussian coefficient field per generator
+    {
+        Site ta;
+
+        gaussian(rng, ca);
+        Group::generator(a, ta);
+        phift += ca*ta;
+    }
+    phift *= w;
+    LOG(Message) << "Field Fourier transform" << std::endl;
+    fft.FFT_all_dim(phi, phift, FFT::backward);
+    phi = 0.5*(phi - adj(phi)); // keep the anti-Hermitian part only
+    trphi2 = -TensorRemove(sum(trace(phi*phi))).real()/vol; // diagnostic, only logged
+    LOG(Message) << "tr(phi^2)= " << trphi2 << std::endl;
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_StochFreeField_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/TrKinetic.cc b/Hadrons/Modules/MScalarSUN/TrKinetic.cc
new file mode 100644
index 00000000..f3823264
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TrKinetic.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TrKinetic.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TTrKinetic<ScalarNxNAdjImplR<2>>; // explicit instantiations for N = 2..6
+template class Grid::Hadrons::MScalarSUN::TTrKinetic<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TTrKinetic<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TTrKinetic<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TTrKinetic<ScalarNxNAdjImplR<6>>;
+
diff --git a/Hadrons/Modules/MScalarSUN/TrKinetic.hpp b/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
new file mode 100644
index 00000000..a12df377
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
@@ -0,0 +1,178 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TrKinetic.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_TrKinetic_hpp_
+#define Hadrons_MScalarSUN_TrKinetic_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         Trace of kinetic term                              *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class TrKineticPar: Serializable // parameters of the TrKinetic module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrKineticPar,
+                                    std::string,  field,   // name of the input field
+                                    DiffType,     type,    // derivative type handed to dmu()
+                                    std::string,  output); // result file stem passed to saveResult()
+};
+
+class TrKineticResult: Serializable // record written by TTrKinetic::execute()
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrKineticResult,
+                                    std::vector<std::vector<Complex>>, value, // nd x nd matrix of lattice sums
+                                    DiffType,                          type); // copy of the 'type' parameter
+};
+
+template <typename SImpl>
+class TTrKinetic: public Module<TrKineticPar>
+{
+public:
+    typedef typename SImpl::Field        Field;
+    typedef typename SImpl::ComplexField ComplexField;
+public:
+    // constructor: 'name' is forwarded to Module<TrKineticPar>
+    TTrKinetic(const std::string name);
+    // destructor
+    virtual ~TTrKinetic(void) {};
+    // dependency relation: input is par().field, outputs are one object per mu <= nu plus "sum"
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the output fields and the temporary vector of derivatives
+    virtual void setup(void);
+    // execution: computes -tr(d_mu phi*d_nu phi) for mu <= nu and the diagonal sum
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(TrKineticSU2, TTrKinetic<ScalarNxNAdjImplR<2>>, MScalarSUN); // one registration per N = 2..6
+MODULE_REGISTER_TMP(TrKineticSU3, TTrKinetic<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrKineticSU4, TTrKinetic<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrKineticSU5, TTrKinetic<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrKineticSU6, TTrKinetic<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                      TTrKinetic implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TTrKinetic<SImpl>::TTrKinetic(const std::string name)
+: Module<TrKineticPar>(name)
+{} // nothing to do beyond base-class construction
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TTrKinetic<SImpl>::getInput(void)
+{
+    // single dependency: the object named by the 'field' parameter
+    std::vector<std::string> deps(1, par().field);
+    return deps;
+}
+
+template <typename SImpl>
+std::vector<std::string> TTrKinetic<SImpl>::getOutput(void)
+{
+    // one object per pair mu <= nu, followed by the "sum" object
+    const auto               nDim = env().getNd();
+    std::vector<std::string> names;
+    for (unsigned int mu = 0; mu < nDim; ++mu)
+    for (unsigned int nu = mu; nu < nDim; ++nu)
+    {
+        names.push_back(varName(getName(), mu, nu));
+    }
+    names.push_back(varName(getName(), "sum"));
+    return names;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTrKinetic<SImpl>::setup(void)
+{
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu) // upper triangle only: nu starts at mu
+    for (unsigned int nu = mu; nu < env().getNd(); ++nu)
+    {
+        envCreateLat(ComplexField, varName(getName(), mu, nu));
+    }
+    envCreateLat(ComplexField, varName(getName(), "sum"));
+    envTmp(std::vector<Field>, "der", 1, env().getNd(), envGetGrid(Field)); // der[mu] is indexed per direction in execute()
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTrKinetic<SImpl>::execute(void)
+{
+    LOG(Message) << "Computing tr(d_mu phi*d_nu phi) using " << par().type
+                 << " derivative" << std::endl;
+    // fills the per-(mu,nu) fields created in setup() and their diagonal sum
+    const unsigned int nd = env().getNd();
+    TrKineticResult    result;
+    auto               &phi    = envGet(Field, par().field);
+    auto               &sumkin = envGet(ComplexField, varName(getName(), "sum"));
+    // der[mu] receives the mu-th derivative of phi (temporary from setup())
+    envGetTmp(std::vector<Field>, der);
+    sumkin = Zero();
+    if (!par().output.empty())
+    {
+        result.type = par().type;
+        result.value.resize(nd, std::vector<Complex>(nd));
+    }
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    {
+        dmu(der[mu], phi, mu, par().type);
+    }
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    for (unsigned int nu = mu; nu < nd; ++nu)
+    {
+        auto &out = envGet(ComplexField, varName(getName(), mu, nu));
+        // only the diagonal terms (mu == nu) enter the "sum" field
+        out = -trace(der[mu]*der[nu]);
+        if (mu == nu)
+        {
+            sumkin += out;
+        }
+        if (!par().output.empty())
+        {
+            result.value[mu][nu] = TensorRemove(sum(out));
+            result.value[nu][mu] = result.value[mu][nu]; // mirror into the lower triangle
+        }
+    }
+    saveResult(par().output, "trkinetic", result);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_TrKinetic_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/TrMag.cc b/Hadrons/Modules/MScalarSUN/TrMag.cc
new file mode 100644
index 00000000..88c8f547
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TrMag.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TrMag.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TTrMag<ScalarNxNAdjImplR<2>>; // explicit instantiations for N = 2..6
+template class Grid::Hadrons::MScalarSUN::TTrMag<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TTrMag<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TTrMag<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TTrMag<ScalarNxNAdjImplR<6>>;
+
diff --git a/Hadrons/Modules/MScalarSUN/TrMag.hpp b/Hadrons/Modules/MScalarSUN/TrMag.hpp
new file mode 100644
index 00000000..b9602be3
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TrMag.hpp
@@ -0,0 +1,148 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TrMag.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_TrMag_hpp_
+#define Hadrons_MScalarSUN_TrMag_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                     Trace of powers of the magnetisation                   *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class TrMagPar: Serializable // parameters of the TrMag module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrMagPar,
+                                    std::string,  field,   // name of the input field object
+                                    unsigned int, maxPow,  // largest power n computed (even n only)
+                                    std::string,  output); // passed to saveResult()
+};
+
+class TrMagResult: Serializable // one saved entry of the TrMag module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrMagResult,
+                                    std::string, op,     // label of the form "tr(mag^n)"
+                                    Real,        value); // real part of the corresponding trace
+};
+
+template <typename SImpl>
+class TTrMag: public Module<TrMagPar>
+{
+public:
+    typedef typename SImpl::Field        Field;
+    typedef typename SImpl::ComplexField ComplexField;
+public:
+    // constructor: takes the name of the module instance
+    TTrMag(const std::string name);
+    // destructor
+    virtual ~TTrMag(void) {};
+    // dependency relation: names of the objects read (input) and produced (output)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: nothing to allocate for this module
+    virtual void setup(void);
+    // execution: computes the traces and saves them
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(TrMagSU2, TTrMag<ScalarNxNAdjImplR<2>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrMagSU3, TTrMag<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrMagSU4, TTrMag<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrMagSU5, TTrMag<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrMagSU6, TTrMag<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                         TTrMag implementation                              *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TTrMag<SImpl>::TTrMag(const std::string name)
+: Module<TrMagPar>(name)
+{} // only forwards the instance name to the Module<TrMagPar> base
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TTrMag<SImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().field};
+    // the field named in the parameters is the only dependency
+    return in;
+}
+
+template <typename SImpl>
+std::vector<std::string> TTrMag<SImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {};
+    // no output object is declared: execute() only hands its results to saveResult()
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTrMag<SImpl>::setup(void)
+{} // intentionally empty: execute() uses no temporaries and creates no objects
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTrMag<SImpl>::execute(void)
+{
+    LOG(Message) << "Computing tr(mag^n) for n even up to " << par().maxPow
+                 << std::endl;
+    // Builds one entry per even n in [2, maxPow] from mag = sum(phi) and passes the list to saveResult().
+    std::vector<TrMagResult> result;
+    auto                     &phi = envGet(Field, par().field);
+
+    auto m2 = sum(phi); // magnetisation: sum() reduction of the input field
+    auto mn = m2;       // copy only fixes the type of mn; its value is reset below
+
+    m2 = -m2*m2; // m2 now holds -mag^2 (sign presumably compensates an anti-Hermitian field -- TODO confirm)
+    mn = 1.;     // running product starts at 1
+    for (unsigned int n = 2; n <= par().maxPow; n += 2)
+    {
+        TrMagResult r;
+
+        mn = mn*m2; // mn = (-mag^2)^(n/2)
+        r.op    = "tr(mag^" + std::to_string(n) + ")";
+        r.value = TensorRemove(trace(mn)).real();
+        result.push_back(r);
+    }
+    saveResult(par().output, "trmag", result); // called unconditionally; result is empty when maxPow < 2
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_TrMag_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/TrPhi.cc b/Hadrons/Modules/MScalarSUN/TrPhi.cc
new file mode 100644
index 00000000..bb16f1bf
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TrPhi.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TrPhi.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TTrPhi<ScalarNxNAdjImplR<2>>;
+template class Grid::Hadrons::MScalarSUN::TTrPhi<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TTrPhi<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TTrPhi<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TTrPhi<ScalarNxNAdjImplR<6>>;
+
diff --git a/Hadrons/Modules/MScalarSUN/TrPhi.hpp b/Hadrons/Modules/MScalarSUN/TrPhi.hpp
new file mode 100644
index 00000000..ecc0b8d3
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TrPhi.hpp
@@ -0,0 +1,173 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TrPhi.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_TrPhi_hpp_
+#define Hadrons_MScalarSUN_TrPhi_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                      Trace of powers of a scalar field                     *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class TrPhiPar: Serializable // parameters of the TrPhi module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrPhiPar,
+                                    std::string,  field,   // name of the input field object
+                                    unsigned int, maxPow,  // largest power n computed (even n only, must be >= 2)
+                                    std::string,  output); // passed to saveResult(); nothing is saved when empty
+};
+
+class TrPhiResult: Serializable // one saved entry of the TrPhi module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrPhiResult,
+                                    std::string, op,     // label of the form "tr(phi^n)"
+                                    Real,        value); // real part of sum() of the corresponding trace field
+};
+
+template <typename SImpl>
+class TTrPhi: public Module<TrPhiPar>
+{
+public:
+    typedef typename SImpl::Field        Field;
+    typedef typename SImpl::ComplexField ComplexField;
+
+public:
+    // constructor: takes the name of the module instance
+    TTrPhi(const std::string name);
+    // destructor
+    virtual ~TTrPhi(void) {};
+    // dependency relation: names of the objects read (input) and produced (output)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: validates maxPow and registers temporaries and output fields
+    virtual void setup(void);
+    // execution: fills the output fields and optionally saves their sums
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(TrPhiSU2, TTrPhi<ScalarNxNAdjImplR<2>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrPhiSU3, TTrPhi<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrPhiSU4, TTrPhi<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrPhiSU5, TTrPhi<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(TrPhiSU6, TTrPhi<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                          TTrPhi implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TTrPhi<SImpl>::TTrPhi(const std::string name)
+: Module<TrPhiPar>(name)
+{} // only forwards the instance name to the Module<TrPhiPar> base
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TTrPhi<SImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().field};
+    // the field named in the parameters is the only dependency
+    return in;
+}
+
+template <typename SImpl>
+std::vector<std::string> TTrPhi<SImpl>::getOutput(void)
+{
+    std::vector<std::string> out;
+    // one output per even n in [2, maxPow]; names must match those created in setup()
+    for (unsigned int n = 2; n <= par().maxPow; n += 2)
+    {
+        out.push_back(varName(getName(), n));
+    }
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTrPhi<SImpl>::setup(void)
+{
+    if (par().maxPow < 2) // the loops below start at n = 2, so a smaller maxPow would produce nothing
+    {
+        HADRONS_ERROR(Size, "'maxPow' should be at least equal to 2");
+    }
+    envTmpLat(Field, "phi2"); // temporary used by execute() for -phi*phi
+    envTmpLat(Field, "buf");  // temporary used by execute() for the running product
+    for (unsigned int n = 2; n <= par().maxPow; n += 2)
+    {
+        envCreateLat(ComplexField, varName(getName(), n)); // one output field per even power
+    }
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTrPhi<SImpl>::execute(void)
+{
+    LOG(Message) << "Computing tr(phi^n) for n even up to " << par().maxPow
+                 << std::endl; 
+    // Fills the output field of each even n with trace((-phi*phi)^(n/2)); sums are saved only if output is set.
+    std::vector<TrPhiResult> result;
+    auto                     &phi = envGet(Field, par().field);
+
+    envGetTmp(Field, phi2);
+    envGetTmp(Field, buf);
+    buf  = 1.;       // running product starts at 1
+    phi2 = -phi*phi; // sign presumably compensates an anti-Hermitian field -- TODO confirm
+    for (unsigned int n = 2; n <= par().maxPow; n += 2)
+    {
+        auto &phin = envGet(ComplexField, varName(getName(), n));
+
+        buf  = buf*phi2; // buf = (-phi*phi)^(n/2)
+        phin = trace(buf);
+        if (!par().output.empty())
+        {
+            TrPhiResult r;
+
+            r.op    = "tr(phi^" + std::to_string(n) + ")";
+            r.value = TensorRemove(sum(phin)).real();
+            result.push_back(r);
+        }
+    }
+    if (result.size() > 0) // result stays empty when no output was requested
+    {
+        saveResult(par().output, "trphi", result);
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_TrPhi_hpp_
diff --git a/extras/Hadrons/Global.cc b/Hadrons/Modules/MScalarSUN/TransProj.cc
similarity index 59%
rename from extras/Hadrons/Global.cc
rename to Hadrons/Modules/MScalarSUN/TransProj.cc
index 4a39af45..47cfe876 100644
--- a/extras/Hadrons/Global.cc
+++ b/Hadrons/Modules/MScalarSUN/TransProj.cc
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Global.cc
+Source file: Hadrons/Modules/MScalarSUN/TransProj.cc
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -26,30 +25,16 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
-
-#include <Grid/Hadrons/Global.hpp>
+#include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
 
 using namespace Grid;
- 
 using namespace Hadrons;
+using namespace MScalarSUN;
 
-HadronsLogger Hadrons::HadronsLogError(1,"Error");
-HadronsLogger Hadrons::HadronsLogWarning(1,"Warning");
-HadronsLogger Hadrons::HadronsLogMessage(1,"Message");
-HadronsLogger Hadrons::HadronsLogIterative(1,"Iterative");
-HadronsLogger Hadrons::HadronsLogDebug(1,"Debug");
+template class Grid::Hadrons::MScalarSUN::TTransProj<ScalarNxNAdjImplR<2>>;
+template class Grid::Hadrons::MScalarSUN::TTransProj<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TTransProj<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TTransProj<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TTransProj<ScalarNxNAdjImplR<6>>;
 
-// type utilities //////////////////////////////////////////////////////////////
-//constexpr unsigned int maxNameSize = 1024u;
-
-std::string Hadrons::typeName(const std::type_info *info)
-{
-    char        *buf;
-    std::string name;
     
-    buf  = abi::__cxa_demangle(info->name(), nullptr, nullptr, nullptr);
-    name = buf;
-    free(buf);
-    
-    return name;
-}
diff --git a/Hadrons/Modules/MScalarSUN/TransProj.hpp b/Hadrons/Modules/MScalarSUN/TransProj.hpp
new file mode 100644
index 00000000..59b7dc71
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TransProj.hpp
@@ -0,0 +1,187 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TransProj.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_TransProj_hpp_
+#define Hadrons_MScalarSUN_TransProj_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         Transverse projection                              *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class TransProjPar: Serializable // parameters of the TransProj module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TransProjPar,
+                                    std::string,  op,      // name of the input ComplexField object
+                                    DiffType,     type,    // derivative type forwarded to dmu()
+                                    std::string,  output); // passed to saveResult(); nothing is saved when empty
+};
+
+class TransProjResult: Serializable // saved output of the TransProj module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TransProjResult,
+                                    std::vector<std::vector<Complex>>, value, // nd x nd matrix indexed [mu][nu]
+                                    DiffType,                          type); // derivative type that was used
+};
+
+template <typename SImpl>
+class TTransProj: public Module<TransProjPar>
+{
+public:
+    typedef typename SImpl::Field        Field;
+    typedef typename SImpl::ComplexField ComplexField;
+public:
+    // constructor: takes the name of the module instance
+    TTransProj(const std::string name);
+    // destructor
+    virtual ~TTransProj(void) {};
+    // dependency relation: names of the objects read (input) and produced (output)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: registers one output field per pair mu <= nu and the temporaries
+    virtual void setup(void);
+    // execution: fills the output fields and optionally saves their sums
+
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(TransProjSU2, TTransProj<ScalarNxNAdjImplR<2>>, MScalarSUN);
+MODULE_REGISTER_TMP(TransProjSU3, TTransProj<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(TransProjSU4, TTransProj<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(TransProjSU5, TTransProj<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(TransProjSU6, TTransProj<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                        TTransProj implementation                           *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TTransProj<SImpl>::TTransProj(const std::string name)
+: Module<TransProjPar>(name)
+{} // only forwards the instance name to the Module<TransProjPar> base
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TTransProj<SImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().op};
+    // the operator named in the parameters is the only dependency
+    return in;
+}
+
+template <typename SImpl>
+std::vector<std::string> TTransProj<SImpl>::getOutput(void)
+{
+    std::vector<std::string> out;
+    // one output per index pair with mu <= nu; names must match those created in setup()
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    for (unsigned int nu = mu; nu < env().getNd(); ++nu)
+    {
+        out.push_back(varName(getName(), mu, nu));
+    }
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTransProj<SImpl>::setup(void)
+{
+    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    for (unsigned int nu = mu; nu < env().getNd(); ++nu)
+    {
+        envCreateLat(ComplexField, varName(getName(), mu, nu)); // one output field per pair mu <= nu
+    }
+    envTmpLat(ComplexField, "buf1"); // temporary for the first derivative in execute()
+    envTmpLat(ComplexField, "buf2"); // temporary for the second derivative in execute()
+    envTmpLat(ComplexField, "lap");  // temporary accumulating the sum over mu of d_mu d_mu op
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTransProj<SImpl>::execute(void)
+{
+    LOG(Message) << "Computing (delta_mu,nu d^2 - d_mu*d_nu)*op using " 
+                 << par().type << " derivatives and op= '" << par().op 
+                 << "'" << std::endl; 
+    // Fills one output field per pair mu <= nu; if an output is requested, saves the nd x nd matrix of sums.
+    const unsigned int nd = env().getNd();
+    TransProjResult    result;
+    auto               &op = envGet(ComplexField, par().op);
+
+    envGetTmp(ComplexField, buf1);
+    envGetTmp(ComplexField, buf2);
+    envGetTmp(ComplexField, lap);
+    lap = Zero(); // accumulates the sum over mu of d_mu d_mu op
+    if (!par().output.empty())
+    {
+        result.type = par().type;
+        result.value.resize(nd, std::vector<Complex>(nd));
+    }
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    {
+        dmu(buf1, op, mu, par().type);
+        dmu(buf2, buf1, mu, par().type);
+        lap += buf2;
+    }
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    for (unsigned int nu = mu; nu < nd; ++nu)
+    {
+        auto &out = envGet(ComplexField, varName(getName(), mu, nu));
+        dmu(buf1, op, mu, par().type);
+        dmu(buf2, buf1, nu, par().type);
+        out = -buf2; // -d_nu d_mu op
+        if (mu == nu)
+        {
+            out += lap; // the delta_mu,nu d^2 term only contributes on the diagonal
+        }
+        if (!par().output.empty())
+        {
+            result.value[mu][nu] = TensorRemove(sum(out));
+            result.value[nu][mu] = result.value[mu][nu]; // mirror into the transposed slot, never visited by the mu <= nu loop
+        }
+    }
+    if (!par().output.empty())
+    {
+        saveResult(par().output, "transproj", result);
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_TransProj_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/TwoPoint.cc b/Hadrons/Modules/MScalarSUN/TwoPoint.cc
new file mode 100644
index 00000000..9cb8b7ad
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TwoPoint.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TwoPoint.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TTwoPoint<ScalarNxNAdjImplR<2>>;
+template class Grid::Hadrons::MScalarSUN::TTwoPoint<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TTwoPoint<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TTwoPoint<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TTwoPoint<ScalarNxNAdjImplR<6>>;
+
diff --git a/Hadrons/Modules/MScalarSUN/TwoPoint.hpp b/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
new file mode 100644
index 00000000..df0d2cba
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
@@ -0,0 +1,226 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TwoPoint.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_TwoPoint_hpp_
+#define Hadrons_MScalarSUN_TwoPoint_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                 2-pt functions for a given set of operators                *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class TwoPointPar: Serializable // parameters of the TwoPoint module
+{
+public:
+    typedef std::pair<std::string, std::string> OpPair; // (sink, source) operator names, as used in execute()
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TwoPointPar,
+                                    std::vector<OpPair>,      op,      // operator pairs to contract
+                                    std::vector<std::string>, mom,     // momenta as strings, nd-1 integer components each
+                                    std::string,              output); // passed to saveResult()
+};
+
+class TwoPointResult: Serializable // one saved correlator of the TwoPoint module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TwoPointResult,
+                                    std::string, sink,           // first operator name of the pair
+                                    std::string, source,         // second operator name of the pair
+                                    std::vector<int>, mom,       // momentum after folding in setup()
+                                    std::vector<Complex>, data); // value returned by makeTwoPoint()
+};
+
+template <typename SImpl>
+class TTwoPoint: public Module<TwoPointPar>
+{
+public:
+    typedef typename SImpl::Field         Field;
+    typedef typename SImpl::ComplexField  ComplexField;
+    typedef          std::vector<Complex> SlicedOp; // one complex value per last-dimension coordinate
+public:
+    // constructor: takes the name of the module instance
+    TTwoPoint(const std::string name);
+    // destructor
+    virtual ~TTwoPoint(void) {};
+    // dependency relation: names of the objects read (input) and produced (output)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: parses the momenta and registers the FFT buffer
+    virtual void setup(void);
+    // execution: computes and saves the two-point functions
+    virtual void execute(void);
+private:
+    std::vector<std::vector<int>> mom_; // parsed momenta, filled by setup()
+};
+
+MODULE_REGISTER_TMP(TwoPointSU2, TTwoPoint<ScalarNxNAdjImplR<2>>, MScalarSUN);
+MODULE_REGISTER_TMP(TwoPointSU3, TTwoPoint<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(TwoPointSU4, TTwoPoint<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(TwoPointSU5, TTwoPoint<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(TwoPointSU6, TTwoPoint<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                       TTwoPoint implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TTwoPoint<SImpl>::TTwoPoint(const std::string name)
+: Module<TwoPointPar>(name)
+{} // only forwards the instance name to the base; mom_ is filled later by setup()
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TTwoPoint<SImpl>::getInput(void)
+{   
+    std::vector<std::string> in;
+    std::set<std::string>    ops; // set removes names that appear in several pairs
+
+    for (auto &p: par().op)
+    {
+        ops.insert(p.first);
+        ops.insert(p.second);
+    }
+    for (auto &o: ops) // each distinct operator name becomes one dependency
+    {
+        in.push_back(o);
+    }
+
+    return in;
+}
+
+template <typename SImpl>
+std::vector<std::string> TTwoPoint<SImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {};
+    // no output object is declared: execute() only hands its results to saveResult()
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTwoPoint<SImpl>::setup(void) // parses par().mom into mom_ and registers the FFT buffer
+{
+    const unsigned int nd = env().getDim().size();
+    mom_.resize(par().mom.size());
+    for (unsigned int i = 0; i < mom_.size(); ++i)
+    {
+        mom_[i] = strToVec<int>(par().mom[i]);
+        if (mom_[i].size() != nd - 1) // momenta carry one component per dimension except the last
+        {
+            HADRONS_ERROR(Size, "momentum number of components different from " 
+                               + std::to_string(nd-1));
+        }
+        for (unsigned int j = 0; j < nd - 1; ++j)
+        {
+            const int dimJ = env().getDim(j); // signed extent keeps the remainder arithmetic in int
+            mom_[i][j] = ((mom_[i][j] % dimJ) + dimJ) % dimJ; // fold any integer, also below -dimJ, into [0, dimJ)
+        }
+    }
+    envTmpLat(ComplexField, "ftBuf"); // temporary receiving the Fourier-transformed operator in execute()
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTwoPoint<SImpl>::execute(void)
+{
+    LOG(Message) << "Computing 2-point functions" << std::endl;
+    for (auto &p: par().op)
+    {
+        LOG(Message) << "  <" << p.first << " " << p.second << ">" << std::endl;
+    }
+    // Transforms each distinct operator once, slices it per momentum, then contracts every requested pair.
+    const unsigned int                           nd      = env().getNd();
+    const unsigned int                           nt      = env().getDim().back();
+    const unsigned int                           nop     = par().op.size(); // NOTE(review): not used below
+    const unsigned int                           nmom    = mom_.size();
+    double                                       partVol = 1.;
+    std::vector<int>                             dMask(nd, 1);
+    std::set<std::string>                        ops;
+    std::vector<TwoPointResult>                  result;
+    std::map<std::string, std::vector<SlicedOp>> slicedOp; // indexed [operator name][momentum][t]
+    FFT                                          fft(envGetGrid(Field));
+    TComplex                                     buf;
+
+    envGetTmp(ComplexField, ftBuf);
+    dMask[nd - 1] = 0; // presumably excludes the last dimension from the transform -- TODO confirm mask convention
+    for (unsigned int mu = 0; mu < nd - 1; ++mu)
+    {
+        partVol *= env().getDim()[mu]; // product of all extents except the last
+    }
+    for (auto &p: par().op)
+    {
+        ops.insert(p.first);
+        ops.insert(p.second);
+    }
+    for (auto &o: ops) // each distinct operator is transformed only once
+    {
+        auto &op = envGet(ComplexField, o);
+
+        slicedOp[o].resize(nmom);
+        LOG(Message) << "Operator '" << o << "' FFT" << std::endl;
+        fft.FFT_dim_mask(ftBuf, op, dMask, FFT::forward);
+        for (unsigned int m = 0; m < nmom; ++m)
+        {
+            auto qt = mom_[m]; // copy of the nd-1 momentum components
+
+            qt.resize(nd); // extra last slot receives the coordinate t below
+            slicedOp[o][m].resize(nt);
+            for (unsigned int t = 0; t < nt; ++t)
+            {
+                qt[nd - 1] = t;
+                peekSite(buf, ftBuf, qt); // read the transformed field at site (momentum, t)
+                slicedOp[o][m][t] = TensorRemove(buf);
+            }
+        }
+    }
+    LOG(Message) << "Making contractions" << std::endl;
+    for (unsigned int m = 0; m < nmom; ++m)
+    for (auto &p: par().op)
+    {
+        TwoPointResult r;
+
+        r.sink   = p.first;
+        r.source = p.second;
+        r.mom    = mom_[m];
+        r.data   = makeTwoPoint(slicedOp[p.first][m], slicedOp[p.second][m], 
+                                1./partVol); // third argument presumably a normalisation factor -- TODO confirm in Utils.hpp
+        result.push_back(r);
+    }
+    saveResult(par().output, "twopt", result); // called unconditionally, unlike TrPhi/TransProj
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_TwoPoint_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/TwoPointNPR.cc b/Hadrons/Modules/MScalarSUN/TwoPointNPR.cc
new file mode 100644
index 00000000..a7b4cd4f
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TwoPointNPR.cc
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TwoPointNPR.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MScalarSUN;
+
+template class Grid::Hadrons::MScalarSUN::TTwoPointNPR<ScalarNxNAdjImplR<2>>;
+template class Grid::Hadrons::MScalarSUN::TTwoPointNPR<ScalarNxNAdjImplR<3>>;
+template class Grid::Hadrons::MScalarSUN::TTwoPointNPR<ScalarNxNAdjImplR<4>>;
+template class Grid::Hadrons::MScalarSUN::TTwoPointNPR<ScalarNxNAdjImplR<5>>;
+template class Grid::Hadrons::MScalarSUN::TTwoPointNPR<ScalarNxNAdjImplR<6>>;
diff --git a/Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp b/Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp
new file mode 100644
index 00000000..6a18f6cb
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp
@@ -0,0 +1,218 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_TwoPointNPR_hpp_
+#define Hadrons_MScalarSUN_TwoPointNPR_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         TwoPointNPR                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+class TwoPointNPRPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TwoPointNPRPar,
+                                    std::vector<std::string>, op,      // names of the operator objects (fetched as ComplexField)
+                                    std::string,              field,   // name of the field object (fetched as Field)
+                                    std::string,              output); // handed to saveResult() by execute()
+};
+
+class TwoPointNPRResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TwoPointNPRResult,
+                                    std::string,          op,    // label of the series (operator name or a "phi_prop_*" tag)
+                                    std::vector<Complex>, data); // one entry per n in [0, env().getDim(0)), filled by execute()
+};
+
+template <typename SImpl>
+class TTwoPointNPR: public Module<TwoPointNPRPar>
+{
+public:
+    typedef typename SImpl::Field                    Field;        // type of the object named by par().field
+    typedef typename SImpl::SiteField::scalar_object Site;         // single-site value read back with peekSite
+    typedef typename SImpl::ComplexField             ComplexField; // type of the objects named in par().op
+public:
+    // constructor: forwards the module name to Module<TwoPointNPRPar>
+    TTwoPointNPR(const std::string name);
+    // destructor
+    virtual ~TTwoPointNPR(void) {};
+    // dependency relation: operators and field as inputs, empty output list
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: checks all lattice extents are equal and registers the FFT temporaries
+    virtual void setup(void);
+    // execution: Fourier transforms, contractions at the chosen momenta, saveResult()
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(TwoPointNPRSU2, TTwoPointNPR<ScalarNxNAdjImplR<2>>, MScalarSUN);
+MODULE_REGISTER_TMP(TwoPointNPRSU3, TTwoPointNPR<ScalarNxNAdjImplR<3>>, MScalarSUN);
+MODULE_REGISTER_TMP(TwoPointNPRSU4, TTwoPointNPR<ScalarNxNAdjImplR<4>>, MScalarSUN);
+MODULE_REGISTER_TMP(TwoPointNPRSU5, TTwoPointNPR<ScalarNxNAdjImplR<5>>, MScalarSUN);
+MODULE_REGISTER_TMP(TwoPointNPRSU6, TTwoPointNPR<ScalarNxNAdjImplR<6>>, MScalarSUN);
+
+/******************************************************************************
+ *                 TTwoPointNPR implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename SImpl>
+TTwoPointNPR<SImpl>::TTwoPointNPR(const std::string name)
+: Module<TwoPointNPRPar>(name) // the name is simply forwarded to the base module
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename SImpl>
+std::vector<std::string> TTwoPointNPR<SImpl>::getInput(void)
+{
+    // dependencies: every operator listed in the parameters, then the field
+    std::vector<std::string> deps(par().op);
+
+    deps.push_back(par().field);
+    return deps;
+}
+
+template <typename SImpl>
+std::vector<std::string> TTwoPointNPR<SImpl>::getOutput(void)
+{
+    // no output object is declared by this module,
+    // so the list handed back is empty
+    return std::vector<std::string>();
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTwoPointNPR<SImpl>::setup(void)
+{
+    const unsigned int nd = env().getNd(), nl = env().getDim(0);
+    // refuse any lattice whose extents differ from that of direction 0
+    for (unsigned int mu = 1; mu < nd; ++mu)
+    {
+        if (env().getDim(mu) != nl)
+        {
+            HADRONS_ERROR(Size, "non-cubic grid");
+        }
+    }
+    envTmpLat(ComplexField, "ftBuf");
+    envTmpLat(Field, "ftMatBuf");
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename SImpl>
+void TTwoPointNPR<SImpl>::execute(void)
+{
+    const unsigned int             nd   = env().getNd();
+    const unsigned int             nl   = env().getDim(0);
+    const Real                     invV = 1./env().getVolume(); // overall 1/volume factor applied to every entry
+    FFT                            fft(envGetGrid(Field));
+    std::vector<TwoPointNPRResult> result;
+    TwoPointNPRResult              twoPtp1, twoPtp2, twoPtDisc;
+    auto                           &phi    = envGet(Field, par().field);
+    bool                           doAux = true; // field-only series are filled on the first operator pass only
+    // ftBuf / ftMatBuf are the temporaries registered in setup(); they receive the Fourier transforms
+    envGetTmp(ComplexField, ftBuf);
+    envGetTmp(Field, ftMatBuf);
+    LOG(Message) << "FFT: field '" << par().field << "'" << std::endl;
+    fft.FFT_all_dim(ftMatBuf, phi, FFT::forward);
+    for (auto &opName: par().op)
+    {
+        auto              &op = envGet(ComplexField, opName);
+        std::vector<int>  p1, p2, p;
+        Site              phip1, phip2;
+        TComplex          opp;
+        TwoPointNPRResult r, rDisc;
+        // two series per operator: 'r' under the operator name, 'rDisc' with the "_disc" suffix
+        LOG(Message) << "FFT: operator '" << opName << "'" << std::endl;
+        fft.FFT_all_dim(ftBuf, op, FFT::forward);
+        LOG(Message) << "Generating vertex function" << std::endl;
+        r.op = opName;
+        r.data.resize(nl);
+        rDisc.op = opName + "_disc";
+        rDisc.data.resize(nl);
+        if (doAux)
+        {
+            twoPtp1.op = "phi_prop_p1";
+            twoPtp1.data.resize(nl);
+            twoPtp2.op = "phi_prop_p2";
+            twoPtp2.data.resize(nl);
+            twoPtDisc.op = "phi_prop_disc";
+            twoPtDisc.data.resize(nl);
+        }
+        for (unsigned int n = 0; n < nl; ++n)
+        {
+            p1.assign(nd, 0);
+            p2.assign(nd, 0);
+            p.assign(nd, 0);
+            // non-exceptional RI/SMOM kinematic
+            // p1 = mu*(1,1,0): in mom
+            // p2 = mu*(0,1,1): out mom
+            // p  = p1 - p2 = mu*(1,0,-1)
+            // mu = 2*n*pi/L
+            p1[0] = n;
+            p1[1] = n;
+            p2[1] = n;
+            p2[2] = n;
+            p[0]  = n;
+            p[2]  = (nl - n) % nl; // -n folded back into [0, nl)
+            peekSite(phip1, ftMatBuf, p1); // transformed field at coordinate p1
+            peekSite(phip2, ftMatBuf, p2); // transformed field at coordinate p2
+            peekSite(opp, ftBuf, p);       // transformed operator at coordinate p
+            if (doAux)
+            {
+                twoPtp1.data[n]   = invV*TensorRemove(trace(phip1*adj(phip1)));
+                twoPtp2.data[n]   = invV*TensorRemove(trace(phip2*adj(phip2)));
+                twoPtDisc.data[n] = invV*TensorRemove(trace(phip2*adj(phip1)));
+            }
+            r.data[n]     = invV*TensorRemove(trace(phip2*adj(phip1))*opp); // mixed p2/p1 trace times the operator
+            rDisc.data[n] = invV*TensorRemove(trace(phip1*adj(phip1))*opp); // p1/p1 trace times the operator
+        }
+        if (doAux)
+        {
+            result.push_back(twoPtp1);
+            result.push_back(twoPtp2);
+            result.push_back(twoPtDisc);
+        }
+        result.push_back(r);
+        result.push_back(rDisc);
+        doAux = false; // the three field-only series above are stored once
+    }
+    saveResult(par().output, "twoptnpr", result);
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_TwoPointNPR_hpp_
diff --git a/Hadrons/Modules/MScalarSUN/Utils.hpp b/Hadrons/Modules/MScalarSUN/Utils.hpp
new file mode 100644
index 00000000..7eba5900
--- /dev/null
+++ b/Hadrons/Modules/MScalarSUN/Utils.hpp
@@ -0,0 +1,134 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MScalarSUN/Utils.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MScalarSUN_Utils_hpp_
+#define Hadrons_MScalarSUN_Utils_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+BEGIN_MODULE_NAMESPACE(MScalarSUN)
+
+GRID_SERIALIZABLE_ENUM(DiffType, undef, forward, 1, backward, 2, central, 3);
+
+template <typename Field>
+inline void dmu(Field &out, const Field &in, const unsigned int mu, const DiffType type)
+{
+    // out = finite difference of 'in' along direction mu, flavour chosen by 'type'
+    if (mu >= Environment::getInstance().getNd())
+    {
+        HADRONS_ERROR(Range, "Derivative direction out of range");
+    }
+
+    switch(type)
+    {
+        case DiffType::forward:
+            out = Cshift(in, mu, 1) - in;
+            break;
+        case DiffType::backward:
+            out = in - Cshift(in, mu, -1);
+            break;
+        case DiffType::central:
+            out = 0.5*(Cshift(in, mu, 1) - Cshift(in, mu, -1));
+            break;
+        default:
+            HADRONS_ERROR(Argument, "Derivative type invalid");
+            break;
+    }
+}
+
+template <typename Field>
+inline void dmuAcc(Field &out, const Field &in, const unsigned int mu, const DiffType type)
+{
+    // accumulating variant: the selected finite difference of 'in' is added to 'out'
+    if (mu >= Environment::getInstance().getNd())
+    {
+        HADRONS_ERROR(Range, "Derivative direction out of range");
+    }
+
+    switch(type)
+    {
+        case DiffType::forward:
+            out += Cshift(in, mu, 1) - in;
+            break;
+        case DiffType::backward:
+            out += in - Cshift(in, mu, -1);
+            break;
+        case DiffType::central:
+            out += 0.5*(Cshift(in, mu, 1) - Cshift(in, mu, -1));
+            break;
+        default:
+            HADRONS_ERROR(Argument, "Derivative type invalid");
+            break;
+    }
+}
+
+template <class SinkSite, class SourceSite>
+std::vector<Complex> makeTwoPoint(const std::vector<SinkSite>   &sink,
+                                  const std::vector<SourceSite> &source,
+                                  const double factor = 1.)
+{
+    // corr[dt] = factor/nt * sum_t trace(sink[(t + dt) mod nt]*adj(source[t]))
+    assert(sink.size() == source.size());
+    const unsigned int   nt = sink.size();
+    std::vector<Complex> corr(nt, 0.);
+
+    for (unsigned int dt = 0; dt < nt; ++dt)
+    {
+        Complex &c = corr[dt];
+        for (unsigned int t = 0; t < nt; ++t)
+        {
+            c += trace(sink[(t+dt)%nt]*adj(source[t]));
+        }
+        c *= factor/static_cast<double>(nt);
+    }
+    return corr;
+}
+
+inline std::string varName(const std::string &name, const std::string &suf)
+{   // builds "<name>_<suf>"; arguments are taken by const reference, so no copy is made
+    return name + "_" + suf;
+}
+// "<name>_<mu>": single-index variant, delegates to the string overload
+inline std::string varName(const std::string &name, const unsigned int mu)
+{
+    return varName(name, std::to_string(mu));
+}
+// "<name>_<mu>_<nu>": two-index variant, delegates to the string overload
+inline std::string varName(const std::string &name, const unsigned int mu, 
+                           const unsigned int nu)
+{
+    return varName(name, std::to_string(mu) + "_" + std::to_string(nu));
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MScalarSUN_Utils_hpp_
diff --git a/Hadrons/Modules/MSink/Point.cc b/Hadrons/Modules/MSink/Point.cc
new file mode 100644
index 00000000..b1deaa64
--- /dev/null
+++ b/Hadrons/Modules/MSink/Point.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSink/Point.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSink/Point.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSink;
+
+template class Grid::Hadrons::MSink::TPoint<FIMPL>;
+template class Grid::Hadrons::MSink::TPoint<ScalarImplCR>;
+
diff --git a/extras/Hadrons/Modules/MSink/Point.hpp b/Hadrons/Modules/MSink/Point.hpp
similarity index 88%
rename from extras/Hadrons/Modules/MSink/Point.hpp
rename to Hadrons/Modules/MSink/Point.hpp
index e3bb95fe..22346087 100644
--- a/extras/Hadrons/Modules/MSink/Point.hpp
+++ b/Hadrons/Modules/MSink/Point.hpp
@@ -2,11 +2,13 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MSink/Point.hpp
+Source file: Hadrons/Modules/MSink/Point.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +31,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MSink_Point_hpp_
 #define Hadrons_MSink_Point_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -51,13 +53,13 @@ template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(FImpl,);
     SINK_TYPE_ALIASES();
 public:
     // constructor
     TPoint(const std::string name);
     // destructor
-    virtual ~TPoint(void) = default;
+    virtual ~TPoint(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -71,8 +73,8 @@ private:
     std::string momphName_;
 };
 
-MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSink);
-MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSink);
+MODULE_REGISTER_TMP(Point,       TPoint<FIMPL>,        MSink);
+MODULE_REGISTER_TMP(ScalarPoint, TPoint<ScalarImplCR>, MSink);
 
 /******************************************************************************
  *                          TPoint implementation                             *
@@ -127,10 +129,10 @@ void TPoint<FImpl>::execute(void)
         envGetTmp(LatticeComplex, coor);
         p  = strToVec<Real>(par().mom);
         ph = Zero();
-        for(unsigned int mu = 0; mu < env().getNd(); mu++)
+        for(unsigned int mu = 0; mu < p.size(); mu++)
         {
             LatticeCoordinate(coor, mu);
-            ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            ph = ph + (p[mu]/env().getDim(mu))*coor;
         }
         ph = exp((Real)(2*M_PI)*i*ph);
         hasPhase_ = true;
diff --git a/Hadrons/Modules/MSink/Smear.cc b/Hadrons/Modules/MSink/Smear.cc
new file mode 100644
index 00000000..aacceedf
--- /dev/null
+++ b/Hadrons/Modules/MSink/Smear.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSink/Smear.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSink/Smear.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSink;
+
+template class Grid::Hadrons::MSink::TSmear<FIMPL>;
+
diff --git a/extras/Hadrons/Modules/MSink/Smear.hpp b/Hadrons/Modules/MSink/Smear.hpp
similarity index 91%
rename from extras/Hadrons/Modules/MSink/Smear.hpp
rename to Hadrons/Modules/MSink/Smear.hpp
index e85ab263..59a8b9cd 100644
--- a/extras/Hadrons/Modules/MSink/Smear.hpp
+++ b/Hadrons/Modules/MSink/Smear.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MSink/Smear.hpp
+Source file: Hadrons/Modules/MSink/Smear.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MSink_Smear_hpp_
 #define Hadrons_MSink_Smear_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -58,7 +59,7 @@ public:
     // constructor
     TSmear(const std::string name);
     // destructor
-    virtual ~TSmear(void) = default;
+    virtual ~TSmear(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -69,7 +70,7 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(Smear, TSmear<FIMPL>, MSink);
+MODULE_REGISTER_TMP(Smear, TSmear<FIMPL>, MSink);
 
 /******************************************************************************
  *                          TSmear implementation                             *
diff --git a/Hadrons/Modules/MSolver/A2AAslashVectors.cc b/Hadrons/Modules/MSolver/A2AAslashVectors.cc
new file mode 100644
index 00000000..74e9f59f
--- /dev/null
+++ b/Hadrons/Modules/MSolver/A2AAslashVectors.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/A2AAslashVectors.cc
+
+Copyright (C) 2015-2018
+
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSolver;
+
+template class Grid::Hadrons::MSolver::TA2AAslashVectors<FIMPL>;
+template class Grid::Hadrons::MSolver::TA2AAslashVectors<ZFIMPL>;
diff --git a/Hadrons/Modules/MSolver/A2AAslashVectors.hpp b/Hadrons/Modules/MSolver/A2AAslashVectors.hpp
new file mode 100644
index 00000000..8531c6ba
--- /dev/null
+++ b/Hadrons/Modules/MSolver/A2AAslashVectors.hpp
@@ -0,0 +1,194 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/A2AAslashVectors.hpp
+
+Copyright (C) 2015-2018
+
+Author: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MSolver_A2AAslashVectors_hpp_
+#define Hadrons_MSolver_A2AAslashVectors_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Solver.hpp>
+#include <Hadrons/A2AVectors.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                 Sequential A-slash insertion on A2A vectors                *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MSolver)
+
+/****************************************************************************
+*  Calculate a sequential propagator on an insertion of i*g_mu*A_mu 
+*  on an A2A vector
+*
+*  vv_i(y) = S(y,x) * i * g_mu*A_mu(x) * v_i(x)
+*
+*  with
+*
+*  - vector: A2A vector v_i(x)
+*  - emField: A_mu(x): electromagnetic photon field
+*  - solver: the solver for calculating the sequential propagator
+*
+*****************************************************************************/
+
+class A2AAslashVectorsPar: Serializable
+{
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(A2AAslashVectorsPar,
+                                  std::string, vector,     // name of the input A2A vector set
+                                  std::string, emField,    // name of the photon field object
+                                  std::string, solver,     // name of the solver object
+                                  std::string, output,     // if non-empty, target passed to A2AVectorsIo::write
+                                  bool,        multiFile); // forwarded unchanged to A2AVectorsIo::write
+};
+
+template <typename FImpl>
+class TA2AAslashVectors : public Module<A2AAslashVectorsPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    SOLVER_TYPE_ALIASES(FImpl,);
+public:
+    typedef PhotonR::GaugeField EmField; // type used to fetch the object named by par().emField
+public:
+    // constructor: forwards the module name to Module<A2AAslashVectorsPar>
+    TA2AAslashVectors(const std::string name);
+    // destructor
+    virtual ~TA2AAslashVectors(void) {};
+    // dependency relation: vector set, photon field and solver in; one object named after the module out
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the output vector set and the temporary fields
+    virtual void setup(void);
+    // execution: one solve per input vector, optional write to disk
+    virtual void execute(void);
+private:
+    unsigned int Ls_; // set in setup() from env().getObjectLs(par().solver)
+};
+
+MODULE_REGISTER_TMP(A2AAslashVectors, TA2AAslashVectors<FIMPL>, MSolver);
+MODULE_REGISTER_TMP(ZA2AAslashVectors, TA2AAslashVectors<ZFIMPL>, MSolver);
+
+/******************************************************************************
+ *                       TA2AAslashVectors implementation                       *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TA2AAslashVectors<FImpl>::TA2AAslashVectors(const std::string name)
+: Module<A2AAslashVectorsPar>(name) // the name is simply forwarded; Ls_ is assigned later, in setup()
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TA2AAslashVectors<FImpl>::getInput(void)
+{
+    // dependencies, in order: A2A vector set, photon field, solver
+    const auto &p = par();
+
+    return {p.vector, p.emField, p.solver};
+}
+
+template <typename FImpl>
+std::vector<std::string> TA2AAslashVectors<FImpl>::getOutput(void)
+{
+    // single product: the object registered under this module's own name
+    const std::string self = getName();
+    return std::vector<std::string>(1, self);
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TA2AAslashVectors<FImpl>::setup(void)
+{
+    Ls_  = env().getObjectLs(par().solver); // Ls value the environment reports for the solver object
+    auto &vvector = envGet(std::vector<FermionField>, par().vector);
+    unsigned int Nmodes = vvector.size(); // the output set gets one field per input vector
+    envCreate(std::vector<FermionField>, getName(), 1, 
+              Nmodes, envGetGrid(FermionField));
+    // temporaries used by execute(): one registered without Ls_, two registered with Ls_
+    envTmpLat(FermionField, "v4dtmp");
+    envTmpLat(FermionField, "v5dtmp", Ls_);
+    envTmpLat(FermionField, "v5dtmp_sol", Ls_);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TA2AAslashVectors<FImpl>::execute(void)
+{
+    auto &solver = envGet(Solver, par().solver);
+    auto &stoch_photon = envGet(EmField,  par().emField);
+    auto &vvector = envGet(std::vector<FermionField>, par().vector);
+    auto &Aslashv = envGet(std::vector<FermionField>, getName()); // output set created in setup()
+    unsigned int Nmodes = vvector.size();
+    auto &mat = solver.getFMat();
+    envGetTmp(FermionField, v4dtmp);
+    envGetTmp(FermionField, v5dtmp);
+    envGetTmp(FermionField, v5dtmp_sol);
+    // imaginary unit multiplying each gamma_mu * A_mu term below
+    Complex ci(0.0,1.0);
+
+    startTimer("Seq Aslash");
+    LOG(Message) << "Calculate Sequential propagator on Aslash * v with the A2A vector " 
+                 << par().vector << " and the photon field " << par().emField << std::endl;
+    for(unsigned int i=0; i<Nmodes; i++)
+    {
+        v4dtmp = Zero();
+        startTimer("Multiply Aslash");
+        for(unsigned int mu=0;mu<=3;mu++) // four directions, hard-coded
+        {
+            Gamma gmu(Gamma::gmu[mu]);
+            v4dtmp +=  ci * PeekIndex<LorentzIndex>(stoch_photon, mu) * (gmu * vvector[i]);
+        }
+        stopTimer("Multiply Aslash");
+
+        startTimer("Inversion");
+        if (Ls_ == 1) // the solver is applied directly to the source built above
+        {
+            solver(Aslashv[i], v4dtmp);
+        }
+        else // otherwise the source is imported into v5dtmp, solved, and the solution exported back
+        {
+            mat.ImportPhysicalFermionSource(v4dtmp, v5dtmp);
+            solver(v5dtmp_sol, v5dtmp);
+            mat.ExportPhysicalFermionSolution(v5dtmp_sol, v4dtmp);
+            Aslashv[i] = v4dtmp;
+        }
+        stopTimer("Inversion");
+    }
+    stopTimer("Seq Aslash");
+    if (!par().output.empty()) // writing to disk is skipped when no output name is given
+    {
+        startTimer("I/O");
+        A2AVectorsIo::write(par().output, Aslashv, par().multiFile, vm().getTrajectory());
+        stopTimer("I/O");
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MSolver_A2AAslashVectors_hpp_
diff --git a/Hadrons/Modules/MSolver/A2AVectors.cc b/Hadrons/Modules/MSolver/A2AVectors.cc
new file mode 100644
index 00000000..f5e9fd6c
--- /dev/null
+++ b/Hadrons/Modules/MSolver/A2AVectors.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/A2AVectors.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: fionnoh <fionnoh@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSolver/A2AVectors.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSolver;
+
+template class Grid::Hadrons::MSolver::TA2AVectors<FIMPL, BaseFermionEigenPack<FIMPL>>;
+template class Grid::Hadrons::MSolver::TA2AVectors<ZFIMPL, BaseFermionEigenPack<ZFIMPL>>;
diff --git a/Hadrons/Modules/MSolver/A2AVectors.hpp b/Hadrons/Modules/MSolver/A2AVectors.hpp
new file mode 100644
index 00000000..f9980ee3
--- /dev/null
+++ b/Hadrons/Modules/MSolver/A2AVectors.hpp
@@ -0,0 +1,258 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/A2AVectors.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: fionnoh <fionnoh@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MSolver_A2AVectors_hpp_
+#define Hadrons_MSolver_A2AVectors_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Solver.hpp>
+#include <Hadrons/EigenPack.hpp>
+#include <Hadrons/A2AVectors.hpp>
+#include <Hadrons/DilutedNoise.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                       Create all-to-all V & W vectors                      *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MSolver)
+
+class A2AVectorsPar: Serializable // parameters of the A2AVectors module
+{
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(A2AVectorsPar,
+                                  std::string, noise,     // DilutedNoise object name
+                                  std::string, action,    // fermion matrix (FMat) object name
+                                  std::string, eigenPack, // eigenpack object name, empty = no low modes
+                                  std::string, solver,    // solver name ("_subtract" appended if eigenPack is set)
+                                  std::string, output,    // output file stem, empty = no I/O
+                                  bool,        multiFile);// forwarded to A2AVectorsIo::write
+};
+
+template <typename FImpl, typename Pack>
+class TA2AVectors : public Module<A2AVectorsPar> // creates the V & W vector sets
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    SOLVER_TYPE_ALIASES(FImpl,);
+    typedef HADRONS_DEFAULT_SCHUR_A2A<FImpl> A2A; // type of the "a2a" temporary building V/W
+public:
+    // constructor
+    TA2AVectors(const std::string name);
+    // destructor
+    virtual ~TA2AVectors(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);  // eigenpack (optional), solver, noise
+    virtual std::vector<std::string> getOutput(void); // "<name>_v" and "<name>_w"
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+private:
+    std::string  solverName_; // NOTE(review): never referenced in this header -- confirm it is needed
+    unsigned int Nl_{0};      // number of low modes (eigenpack size), set in setup()
+};
+
+MODULE_REGISTER_TMP(A2AVectors, 
+    ARG(TA2AVectors<FIMPL, BaseFermionEigenPack<FIMPL>>), MSolver);
+MODULE_REGISTER_TMP(ZA2AVectors, 
+    ARG(TA2AVectors<ZFIMPL, BaseFermionEigenPack<ZFIMPL>>), MSolver);
+
+/******************************************************************************
+ *                       TA2AVectors implementation                           *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl, typename Pack>
+TA2AVectors<FImpl, Pack>::TA2AVectors(const std::string name)
+: Module<A2AVectorsPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl, typename Pack>
+std::vector<std::string> TA2AVectors<FImpl, Pack>::getInput(void)
+{
+    std::string              sub_string;
+    std::vector<std::string> in;
+    // inputs: optional eigenpack, then the solver and the diluted noise
+    if (!par().eigenPack.empty())
+    {
+        in.push_back(par().eigenPack);
+        sub_string = "_subtract"; // with low modes, depend on "<solver>_subtract"
+    }
+    in.push_back(par().solver + sub_string);
+    in.push_back(par().noise);
+
+    return in;
+}
+
+template <typename FImpl, typename Pack>
+std::vector<std::string> TA2AVectors<FImpl, Pack>::getOutput(void)
+{
+    std::vector<std::string> out = {getName() + "_v", getName() + "_w"};
+
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl, typename Pack>
+void TA2AVectors<FImpl, Pack>::setup(void)
+{
+    bool        hasLowModes = (!par().eigenPack.empty());
+    std::string sub_string  = (hasLowModes) ? "_subtract" : "";
+    auto        &noise      = envGet(DilutedNoise<FImpl>, par().noise);
+    auto        &action     = envGet(FMat, par().action);
+    auto        &solver     = envGet(Solver, par().solver + sub_string);
+    int         Ls          = env().getObjectLs(par().action);
+    // low-mode count comes from the eigenpack; Nl_ keeps its default 0 otherwise
+    if (hasLowModes)
+    {
+        auto &epack = envGet(Pack, par().eigenPack);
+        Nl_ = epack.evec.size();
+    }
+    envCreate(std::vector<FermionField>, getName() + "_v", 1, 
+              Nl_ + noise.size(), envGetGrid(FermionField)); // low modes first, then noise
+    envCreate(std::vector<FermionField>, getName() + "_w", 1, 
+              Nl_ + noise.size(), envGetGrid(FermionField));
+    if (Ls > 1) // "f5" is the scratch field passed to the *5D builders in execute()
+    {
+        envTmpLat(FermionField, "f5", Ls);
+    }
+    envTmp(A2A, "a2a", 1, action, solver); // V/W builder, fetched in execute() with envGetTmp
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl, typename Pack>
+void TA2AVectors<FImpl, Pack>::execute(void)
+{
+    // solver name keyed on the eigenPack parameter, exactly as in getInput()/setup()
+    std::string sub_string = (!par().eigenPack.empty()) ? "_subtract" : "";
+    auto        &action    = envGet(FMat, par().action);
+    auto        &solver    = envGet(Solver, par().solver + sub_string);
+    auto        &noise     = envGet(DilutedNoise<FImpl>, par().noise);
+    auto        &v         = envGet(std::vector<FermionField>, getName() + "_v");
+    auto        &w         = envGet(std::vector<FermionField>, getName() + "_w");
+    int         Ls         = env().getObjectLs(par().action);
+    envGetTmp(A2A, a2a);
+
+    if (Nl_ > 0)
+    {
+        LOG(Message) << "Computing all-to-all vectors "
+                     << " using eigenpack '" << par().eigenPack << "' ("
+                     << Nl_ << " low modes) and noise '"
+                     << par().noise << "' (" << noise.size()
+                     << " noise vectors)" << std::endl;
+    }
+    else
+    {
+        LOG(Message) << "Computing all-to-all vectors "
+                     << " using noise '" << par().noise << "' (" << noise.size()
+                     << " noise vectors)" << std::endl;
+    }
+    // Low modes: V and W entries [0, Nl_) from each eigenvector/eigenvalue pair
+    for (unsigned int il = 0; il < Nl_; il++)
+    {
+        auto &epack  = envGet(Pack, par().eigenPack);
+
+        startTimer("V low mode");
+        LOG(Message) << "V vector i = " << il << " (low mode)" << std::endl;
+        if (Ls == 1)
+        {
+            a2a.makeLowModeV(v[il], epack.evec[il], epack.eval[il]);
+        }
+        else
+        {
+            envGetTmp(FermionField, f5);
+            a2a.makeLowModeV5D(v[il], f5, epack.evec[il], epack.eval[il]);
+        }
+        stopTimer("V low mode");
+        startTimer("W low mode");
+        LOG(Message) << "W vector i = " << il << " (low mode)" << std::endl;
+        if (Ls == 1)
+        {
+            a2a.makeLowModeW(w[il], epack.evec[il], epack.eval[il]);
+        }
+        else
+        {
+            envGetTmp(FermionField, f5);
+            a2a.makeLowModeW5D(w[il], f5, epack.evec[il], epack.eval[il]);
+        }
+        stopTimer("W low mode");
+    }
+
+    // High modes: one V/W pair per noise vector, stored after the Nl_ low modes
+    for (unsigned int ih = 0; ih < noise.size(); ih++)
+    {
+        startTimer("V high mode");
+        LOG(Message) << "V vector i = " << Nl_ + ih
+                     << " (" << ((Nl_ > 0) ? "high " : "")
+                     << "stochastic mode)" << std::endl;
+        if (Ls == 1)
+        {
+            a2a.makeHighModeV(v[Nl_ + ih], noise[ih]);
+        }
+        else
+        {
+            envGetTmp(FermionField, f5);
+            a2a.makeHighModeV5D(v[Nl_ + ih], f5, noise[ih]);
+        }
+        stopTimer("V high mode");
+        startTimer("W high mode");
+        LOG(Message) << "W vector i = " << Nl_ + ih
+                     << " (" << ((Nl_ > 0) ? "high " : "")
+                     << "stochastic mode)" << std::endl;
+        if (Ls == 1)
+        {
+            a2a.makeHighModeW(w[Nl_ + ih], noise[ih]);
+        }
+        else
+        {
+            envGetTmp(FermionField, f5);
+            a2a.makeHighModeW5D(w[Nl_ + ih], f5, noise[ih]);
+        }
+        stopTimer("W high mode");
+    }
+
+    // I/O, only when an output stem is set
+    if (!par().output.empty())
+    {
+        startTimer("V I/O");
+        A2AVectorsIo::write(par().output + "_v", v, par().multiFile, vm().getTrajectory());
+        stopTimer("V I/O");
+        startTimer("W I/O");
+        A2AVectorsIo::write(par().output + "_w", w, par().multiFile, vm().getTrajectory());
+        stopTimer("W I/O");
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MSolver_A2AVectors_hpp_
diff --git a/Hadrons/Modules/MSolver/Guesser.hpp b/Hadrons/Modules/MSolver/Guesser.hpp
new file mode 100644
index 00000000..7063198d
--- /dev/null
+++ b/Hadrons/Modules/MSolver/Guesser.hpp
@@ -0,0 +1,85 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/Guesser.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MSolver_Guesser_hpp_
+#define Hadrons_MSolver_Guesser_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/EigenPack.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+BEGIN_MODULE_NAMESPACE(MSolver)
+
+// Build the initial-guess functor for a solver: a zero guess when no eigenpack
+// name is given, otherwise a deflated guess built from the named eigenpack.
+template <typename FImpl, int nBasis>
+std::shared_ptr<LinearFunction<typename FImpl::FermionField>>
+makeGuesser(const std::string epackName)
+{
+    typedef typename FImpl::FermionField                  FermionField;
+    typedef BaseFermionEigenPack<FImpl>                   EPack;
+    typedef CoarseFermionEigenPack<FImpl, nBasis>         CoarseEPack;
+    typedef DeflatedGuesser<FermionField>                 FineGuesser;
+    typedef LocalCoherenceDeflatedGuesser<
+        FermionField, typename CoarseEPack::CoarseField>  CoarseGuesser;
+
+    std::shared_ptr<LinearFunction<FermionField>> guess;
+    DEFINE_ENV_LAMBDA;
+
+    if (!epackName.empty())
+    {
+        try
+        {
+            auto &cpack = envGetDerived(EPack, CoarseEPack, epackName);
+            // coarse eigenpack available: local coherence deflation
+            LOG(Message) << "using low-mode deflation with coarse eigenpack '"
+                         << epackName << "' ("
+                         << cpack.evecCoarse.size() << " modes)" << std::endl;
+            guess.reset(new CoarseGuesser(cpack.evec, cpack.evecCoarse,
+                                          cpack.evalCoarse));
+        }
+        catch (const Exceptions::ObjectType &)
+        {
+            auto &fpack = envGet(EPack, epackName);
+            // an ObjectType error in the coarse branch falls back to a fine eigenpack
+            LOG(Message) << "using low-mode deflation with eigenpack '"
+                         << epackName << "' ("
+                         << fpack.evec.size() << " modes)" << std::endl;
+            guess.reset(new FineGuesser(fpack.evec, fpack.eval));
+        }
+    }
+    else
+    {
+        guess.reset(new ZeroGuesser<FermionField>());
+    }
+    return guess;
+}
+
+END_MODULE_NAMESPACE
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MSolver_Guesser_hpp_
diff --git a/Hadrons/Modules/MSolver/LocalCoherenceLanczos.cc b/Hadrons/Modules/MSolver/LocalCoherenceLanczos.cc
new file mode 100644
index 00000000..dacc871f
--- /dev/null
+++ b/Hadrons/Modules/MSolver/LocalCoherenceLanczos.cc
@@ -0,0 +1,39 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/LocalCoherenceLanczos.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSolver;
+
+template class Grid::Hadrons::MSolver::TLocalCoherenceLanczos<FIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS>;
+template class Grid::Hadrons::MSolver::TLocalCoherenceLanczos<ZFIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS>;
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+template class Grid::Hadrons::MSolver::TLocalCoherenceLanczos<FIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS, FIMPLF>;
+template class Grid::Hadrons::MSolver::TLocalCoherenceLanczos<ZFIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS, ZFIMPLF>;
+#endif
diff --git a/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp b/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
new file mode 100644
index 00000000..492ff39e
--- /dev/null
+++ b/Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
@@ -0,0 +1,190 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MSolver_LocalCoherenceLanczos_hpp_
+#define Hadrons_MSolver_LocalCoherenceLanczos_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/EigenPack.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                    Local coherence Lanczos eigensolver                     *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MSolver)
+
+class LocalCoherenceLanczosPar: Serializable // parameters of the LocalCoherenceLanczos module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosPar,
+                                    std::string,   action,         // fermion matrix (FMat) object name
+                                    bool,          doCoarse,       // also run the coarse-grid stage
+                                    LanczosParams, fineParams,     // passed to calcFine
+                                    LanczosParams, coarseParams,   // passed to calcCoarse
+                                    ChebyParams,   smoother,       // passed to calcCoarse/testCoarse
+                                    RealD,         coarseRelaxTol, // passed to calcCoarse/testCoarse
+                                    std::string,   blockSize,      // parsed with strToVec<int> for the coarse grid
+                                    std::string,   output,         // output stem, empty = no write
+                                    bool,          multiFile);     // forwarded to writeFine/writeCoarse
+};
+
+template <typename FImpl, int nBasis, typename FImplIo = FImpl>
+class TLocalCoherenceLanczos: public Module<LocalCoherenceLanczosPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    typedef LocalCoherenceLanczos<typename FImpl::SiteSpinor, 
+                                  typename FImpl::SiteComplex, 
+                                  nBasis>                  LCL;        // type of the "solver" temporary
+    typedef BaseFermionEigenPack<FImpl>                    BasePack;   // type the output is registered as
+    typedef CoarseFermionEigenPack<FImpl, nBasis, FImplIo> CoarsePack; // concrete type of the output
+    typedef HADRONS_DEFAULT_SCHUR_OP<FMat, FermionField>   SchurFMat;  // type of the "mat" temporary
+public:
+    // constructor
+    TLocalCoherenceLanczos(const std::string name);
+    // destructor
+    virtual ~TLocalCoherenceLanczos(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);  // the action
+    virtual std::vector<std::string> getOutput(void); // one object named after the module
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(LocalCoherenceLanczos, ARG(TLocalCoherenceLanczos<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
+MODULE_REGISTER_TMP(ZLocalCoherenceLanczos, ARG(TLocalCoherenceLanczos<ZFIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+MODULE_REGISTER_TMP(LocalCoherenceLanczosIo32, ARG(TLocalCoherenceLanczos<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS, FIMPLF>), MSolver);
+MODULE_REGISTER_TMP(ZLocalCoherenceLanczosIo32, ARG(TLocalCoherenceLanczos<ZFIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS, ZFIMPLF>), MSolver);
+#endif
+
+/******************************************************************************
+ *                 TLocalCoherenceLanczos implementation                      *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl, int nBasis, typename FImplIo>
+TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::TLocalCoherenceLanczos(const std::string name)
+: Module<LocalCoherenceLanczosPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl, int nBasis, typename FImplIo>
+std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::getInput(void)
+{
+    std::vector<std::string> in = {par().action};
+    
+    return in;
+}
+
+template <typename FImpl, int nBasis, typename FImplIo>
+std::vector<std::string> TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl, int nBasis, typename FImplIo>
+void TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::setup(void)
+{
+    LOG(Message) << "Setting up local coherence Lanczos eigensolver for"
+                 << " action '" << par().action << "' (" << nBasis
+                 << " eigenvectors)..." << std::endl;
+    // coarse grid is requested from the action's Ls and the parsed blockSize
+    unsigned int Ls        = env().getObjectLs(par().action);
+    auto         blockSize = strToVec<int>(par().blockSize);
+
+    env().createCoarseGrid(blockSize, Ls);
+
+    auto cg  = env().getCoarseGrid(blockSize, Ls);
+    int  cNm = (par().doCoarse) ? par().coarseParams.Nm : 0; // 0 coarse modes when doCoarse is off
+
+    LOG(Message) << "Coarse grid: " << cg->GlobalDimensions() << std::endl;
+    envCreateDerived(BasePack, CoarsePack, getName(), Ls,
+                     par().fineParams.Nm, cNm, env().getRbGrid(Ls), cg);
+    // the containers of the pack just created are handed to the solver below
+    auto &epack = envGetDerived(BasePack, CoarsePack, getName());
+
+    envTmp(SchurFMat, "mat", Ls, envGet(FMat, par().action));
+    envGetTmp(SchurFMat, mat);
+    envTmp(LCL, "solver", Ls, env().getRbGrid(Ls), cg, mat, 
+           Odd, epack.evec, epack.evecCoarse, epack.eval, epack.evalCoarse);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl, int nBasis, typename FImplIo>
+void TLocalCoherenceLanczos<FImpl, nBasis, FImplIo>::execute(void)
+{
+    auto &finePar   = par().fineParams;
+    auto &coarsePar = par().coarseParams;
+    auto &epack     = envGetDerived(BasePack, CoarsePack, getName());
+    // record the action's and this module's parameter strings in the eigenpack
+    epack.record.operatorXml = vm().getModule(par().action)->parString();
+    epack.record.solverXml   = parString();
+    envGetTmp(LCL, solver);
+    LOG(Message) << "Performing fine grid IRL -- Nstop= " 
+                 << finePar.Nstop << ", Nk= " << finePar.Nk << ", Nm= " 
+                 << finePar.Nm << std::endl;
+    solver.calcFine(finePar.Cheby, finePar.Nstop, finePar.Nk, finePar.Nm,
+                    finePar.resid,finePar.MaxIt, finePar.betastp, 
+                    finePar.MinRes);
+    solver.testFine(finePar.resid*100.0); // tested with 100x the solve residual
+    if (!par().output.empty()) // fine vectors are written before the coarse stage starts
+    {
+        epack.writeFine(par().output, par().multiFile, vm().getTrajectory());
+    }
+    if (par().doCoarse) // optional coarse-grid stage
+    {
+        LOG(Message) << "Orthogonalising" << std::endl;
+        solver.Orthogonalise();
+        LOG(Message) << "Performing coarse grid IRL -- Nstop= " 
+                    << coarsePar.Nstop << ", Nk= " << coarsePar.Nk << ", Nm= " 
+                    << coarsePar.Nm << std::endl;
+        solver.calcCoarse(coarsePar.Cheby, par().smoother, par().coarseRelaxTol,
+                          coarsePar.Nstop, coarsePar.Nk, coarsePar.Nm, 
+                          coarsePar.resid, coarsePar.MaxIt, coarsePar.betastp, 
+                          coarsePar.MinRes);
+        solver.testCoarse(coarsePar.resid*100.0, par().smoother, 
+                        par().coarseRelaxTol);
+        if (!par().output.empty())
+        {
+            epack.writeCoarse(par().output, par().multiFile, vm().getTrajectory());
+        }
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MSolver_LocalCoherenceLanczos_hpp_
diff --git a/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.cc b/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.cc
new file mode 100644
index 00000000..36dc7c94
--- /dev/null
+++ b/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSolver;
+
+template class Grid::Hadrons::MSolver::TMixedPrecisionRBPrecCG<FIMPLF, FIMPLD, HADRONS_DEFAULT_LANCZOS_NBASIS>;
+template class Grid::Hadrons::MSolver::TMixedPrecisionRBPrecCG<ZFIMPLF, ZFIMPLD, HADRONS_DEFAULT_LANCZOS_NBASIS>;
diff --git a/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp b/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp
new file mode 100644
index 00000000..67e1be19
--- /dev/null
+++ b/Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp
@@ -0,0 +1,197 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MSolver_MixedPrecisionRBPrecCG_hpp_
+#define Hadrons_MSolver_MixedPrecisionRBPrecCG_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Solver.hpp>
+#include <Hadrons/EigenPack.hpp>
+#include <Hadrons/Modules/MSolver/Guesser.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *              Mixed precision schur red-black preconditioned CG             *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MSolver)
+
+class MixedPrecisionRBPrecCGPar: Serializable // parameters of the MixedPrecisionRBPrecCG module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(MixedPrecisionRBPrecCGPar,
+                                    std::string , innerAction,       // FMatInner object name
+                                    std::string , outerAction,       // FMatOuter object name
+                                    unsigned int, maxInnerIteration, // passed to the mixed-precision CG
+                                    unsigned int, maxOuterIteration, // passed to the mixed-precision CG
+                                    double      , residual,          // passed to the mixed-precision CG
+                                    std::string , eigenPack);        // eigenpack for the guesser, empty = zero guess
+};
+
+template <typename FImplInner, typename FImplOuter, int nBasis>
+class TMixedPrecisionRBPrecCG: public Module<MixedPrecisionRBPrecCGPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImplInner, Inner);
+    FERM_TYPE_ALIASES(FImplOuter, Outer);
+    SOLVER_TYPE_ALIASES(FImplOuter,);
+    typedef HADRONS_DEFAULT_SCHUR_OP<FMatInner, FermionFieldInner> SchurFMatInner;
+    typedef HADRONS_DEFAULT_SCHUR_OP<FMatOuter, FermionFieldOuter> SchurFMatOuter;
+private:
+    template <typename Field> // adapter: exposes a LinearFunction as an OperatorFunction
+    class OperatorFunctionWrapper: public OperatorFunction<Field>
+    {
+    public:
+        OperatorFunctionWrapper(LinearFunction<Field> &fn): fn_(fn) {};
+        virtual ~OperatorFunctionWrapper(void) = default;
+        virtual void operator()(LinearOperatorBase<Field> &op, 
+                                const Field &in, Field &out)
+        {
+            fn_(in, out); // 'op' is ignored, the wrapped function is applied as is
+        }
+    private:
+        LinearFunction<Field> &fn_; // held by reference, must outlive the wrapper
+    };
+public:
+    // constructor
+    TMixedPrecisionRBPrecCG(const std::string name);
+    // destructor
+    virtual ~TMixedPrecisionRBPrecCG(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);     // empty
+    virtual std::vector<std::string> getReference(void); // both actions and the optional eigenpack
+    virtual std::vector<std::string> getOutput(void);    // "<name>" and "<name>_subtract"
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(MixedPrecisionRBPrecCG, 
+    ARG(TMixedPrecisionRBPrecCG<FIMPLF, FIMPLD, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
+MODULE_REGISTER_TMP(ZMixedPrecisionRBPrecCG, 
+    ARG(TMixedPrecisionRBPrecCG<ZFIMPLF, ZFIMPLD, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
+
+/******************************************************************************
+ *                 TMixedPrecisionRBPrecCG implementation                     *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImplInner, typename FImplOuter, int nBasis>
+TMixedPrecisionRBPrecCG<FImplInner, FImplOuter, nBasis>
+::TMixedPrecisionRBPrecCG(const std::string name)
+: Module<MixedPrecisionRBPrecCGPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImplInner, typename FImplOuter, int nBasis>
+std::vector<std::string> TMixedPrecisionRBPrecCG<FImplInner, FImplOuter, nBasis>
+::getInput(void)
+{
+    std::vector<std::string> in;
+    
+    return in;
+}
+
+template <typename FImplInner, typename FImplOuter, int nBasis>
+std::vector<std::string> TMixedPrecisionRBPrecCG<FImplInner, FImplOuter, nBasis>
+::getReference(void)
+{
+    std::vector<std::string> ref = {par().innerAction, par().outerAction};
+    
+    if (!par().eigenPack.empty())
+    {
+        ref.push_back(par().eigenPack);
+    }
+    
+    return ref;
+}
+
+template <typename FImplInner, typename FImplOuter, int nBasis>
+std::vector<std::string> TMixedPrecisionRBPrecCG<FImplInner, FImplOuter, nBasis>
+::getOutput(void)
+{
+    std::vector<std::string> out = {getName(), getName() + "_subtract"};
+
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImplInner, typename FImplOuter, int nBasis>
+void TMixedPrecisionRBPrecCG<FImplInner, FImplOuter, nBasis>
+::setup(void)
+{
+    LOG(Message) << "Setting up Schur red-black preconditioned mixed-precision "
+                 << "CG for inner/outer action '" << par().innerAction 
+                 << "'/'" << par().outerAction << "', residual "
+                 << par().residual << ", and maximum inner/outer iteration " 
+                 << par().maxInnerIteration << "/" << par().maxOuterIteration
+                 << std::endl;
+    // NOTE(review): Ls is read from the inner action only -- presumably both share it; confirm
+    auto Ls        = env().getObjectLs(par().innerAction);
+    auto &imat     = envGet(FMatInner, par().innerAction);
+    auto &omat     = envGet(FMatOuter, par().outerAction);
+    auto guesserPt = makeGuesser<FImplOuter, nBasis>(par().eigenPack);
+    // factory returning a solver closure with guess subtraction switched on or off
+    auto makeSolver = [&imat, &omat, guesserPt, Ls, this](bool subGuess) 
+    {
+        return [&imat, &omat, guesserPt, subGuess, Ls, this]
+        (FermionFieldOuter &sol, const FermionFieldOuter &source) 
+        {
+            typedef typename FermionFieldInner::vector_type VTypeInner;
+
+            SchurFMatInner simat(imat); // Schur operators and CG are built on every solve call
+            SchurFMatOuter somat(omat);
+            MixedPrecisionConjugateGradient<FermionFieldOuter, FermionFieldInner> 
+                mpcg(par().residual, par().maxInnerIteration, 
+                     par().maxOuterIteration, 
+                     env().template getRbGrid<VTypeInner>(Ls),
+                     simat, somat);
+            OperatorFunctionWrapper<FermionFieldOuter> wmpcg(mpcg); // adapt mpcg for the Schur solve
+            HADRONS_DEFAULT_SCHUR_SOLVE<FermionFieldOuter> schurSolver(wmpcg);
+            schurSolver.subtractGuess(subGuess);
+            schurSolver(omat, source, sol, *guesserPt);
+        };
+    };
+    auto solver = makeSolver(false); // registered as "<name>"
+    envCreate(Solver, getName(), Ls, solver, omat);
+    auto solver_subtract = makeSolver(true); // registered as "<name>_subtract"
+    envCreate(Solver, getName() + "_subtract", Ls, solver_subtract, omat);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImplInner, typename FImplOuter, int nBasis>
+void TMixedPrecisionRBPrecCG<FImplInner, FImplOuter, nBasis>
+::execute(void)
+{}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MSolver_MixedPrecisionRBPrecCG_hpp_
diff --git a/Hadrons/Modules/MSolver/RBPrecCG.cc b/Hadrons/Modules/MSolver/RBPrecCG.cc
new file mode 100644
index 00000000..6d26532b
--- /dev/null
+++ b/Hadrons/Modules/MSolver/RBPrecCG.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/RBPrecCG.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSolver;
+
+template class Grid::Hadrons::MSolver::TRBPrecCG<FIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS>;
+template class Grid::Hadrons::MSolver::TRBPrecCG<ZFIMPL,HADRONS_DEFAULT_LANCZOS_NBASIS>;
+
diff --git a/Hadrons/Modules/MSolver/RBPrecCG.hpp b/Hadrons/Modules/MSolver/RBPrecCG.hpp
new file mode 100644
index 00000000..8c83c6a1
--- /dev/null
+++ b/Hadrons/Modules/MSolver/RBPrecCG.hpp
@@ -0,0 +1,164 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSolver/RBPrecCG.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: fionnoh <fionnoh@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#ifndef Hadrons_MSolver_RBPrecCG_hpp_
+#define Hadrons_MSolver_RBPrecCG_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Solver.hpp>
+#include <Hadrons/EigenPack.hpp>
+#include <Hadrons/Modules/MSolver/Guesser.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                     Schur red-black preconditioned CG                      *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MSolver)
+
+class RBPrecCGPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(RBPrecCGPar ,
+                                    std::string , action,        // name of the FMat object the solver acts on
+                                    unsigned int, maxIteration,  // CG iteration cap; setup() rejects 0
+                                    double      , residual,      // residual handed to ConjugateGradient
+                                    std::string , eigenPack);    // name passed to makeGuesser; may be empty
+};
+
+template <typename FImpl, int nBasis>
+class TRBPrecCG: public Module<RBPrecCGPar>
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    SOLVER_TYPE_ALIASES(FImpl,);
+public:
+    // constructor (name is forwarded to the Module base class)
+    TRBPrecCG(const std::string name);
+    // destructor
+    virtual ~TRBPrecCG(void) {};
+    // dependencies/products
+    virtual std::vector<std::string> getInput(void);     // always empty
+    virtual std::vector<std::string> getReference(void); // action, plus eigenPack when non-empty
+    virtual std::vector<std::string> getOutput(void);    // <name> and <name>_subtract
+protected:
+    // setup: validates maxIteration and creates the two Solver objects
+    virtual void setup(void);
+    // execution: empty, all the work is done in setup()
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(RBPrecCG, ARG(TRBPrecCG<FIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
+MODULE_REGISTER_TMP(ZRBPrecCG, ARG(TRBPrecCG<ZFIMPL, HADRONS_DEFAULT_LANCZOS_NBASIS>), MSolver);
+
+/******************************************************************************
+ *                      TRBPrecCG template implementation                     *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl, int nBasis>
+TRBPrecCG<FImpl, nBasis>::TRBPrecCG(const std::string name)
+: Module(name) // the module name is handed to the base class
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+// No input objects: the action and eigenpack are listed by getReference().
+template <typename FImpl, int nBasis>
+std::vector<std::string> TRBPrecCG<FImpl, nBasis>::getInput(void)
+{
+    // an empty list is returned unconditionally
+    return std::vector<std::string>();
+}
+
+// Referenced objects: the action, followed by the eigenpack when one is named.
+template <typename FImpl, int nBasis>
+std::vector<std::string> TRBPrecCG<FImpl, nBasis>::getReference(void)
+{
+    std::vector<std::string> ref(1, par().action);
+
+    // the eigenpack is optional: list it only when a name is given
+    if (par().eigenPack.empty()) return ref;
+    ref.push_back(par().eigenPack);
+
+    return ref;
+}
+
+// Two products: the solver and its "_subtract" variant, both created in setup().
+template <typename FImpl, int nBasis>
+std::vector<std::string> TRBPrecCG<FImpl, nBasis>::getOutput(void)
+{
+    const std::string name = getName();
+    return {name, name + "_subtract"};
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl, int nBasis>
+void TRBPrecCG<FImpl, nBasis>::setup(void)
+{
+    // an iteration cap of zero is rejected up front
+    if (par().maxIteration == 0)
+    {
+        HADRONS_ERROR(Argument, "zero maximum iteration");
+    }
+    LOG(Message) << "setting up Schur red-black preconditioned CG for action '"
+                 << par().action << "' with residual " << par().residual
+                 << ", maximum iteration " << par().maxIteration << std::endl;
+
+    auto Ls      = env().getObjectLs(par().action);
+    auto &fmat   = envGet(FMat, par().action);
+    auto guesser = makeGuesser<FImpl, nBasis>(par().eigenPack);
+    // factory for the solver callable; subGuess is forwarded to subtractGuess()
+    auto buildSolver = [&fmat, guesser, this](bool subGuess)
+    {
+        return [&fmat, guesser, subGuess, this](FermionField &sol,
+                                                const FermionField &source)
+        {
+            ConjugateGradient<FermionField>           cg(par().residual, par().maxIteration);
+            HADRONS_DEFAULT_SCHUR_SOLVE<FermionField> schur(cg);
+            schur.subtractGuess(subGuess);
+            schur(fmat, source, sol, *guesser);
+        };
+    };
+    auto plain      = buildSolver(false);
+    auto subtracted = buildSolver(true);
+    envCreate(Solver, getName(), Ls, plain, fmat);
+    envCreate(Solver, getName() + "_subtract", Ls, subtracted, fmat);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl, int nBasis>
+void TRBPrecCG<FImpl, nBasis>::execute(void)
+{} // nothing to execute: both solver objects are created in setup()
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MSolver_RBPrecCG_hpp_
diff --git a/Hadrons/Modules/MSource/Momentum.cc b/Hadrons/Modules/MSource/Momentum.cc
new file mode 100644
index 00000000..9bcf65ae
--- /dev/null
+++ b/Hadrons/Modules/MSource/Momentum.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSource/Momentum.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSource/Momentum.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSource;
+
+template class Grid::Hadrons::MSource::TMomentum<FIMPL>;
+
diff --git a/Hadrons/Modules/MSource/Momentum.hpp b/Hadrons/Modules/MSource/Momentum.hpp
new file mode 100644
index 00000000..42cda57a
--- /dev/null
+++ b/Hadrons/Modules/MSource/Momentum.hpp
@@ -0,0 +1,149 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSource/Momentum.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_Momentum_hpp_
+#define Hadrons_Momentum_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/* 
+Plane Wave source
+-----------------
+src_x = exp(i * sum_mu (2*pi/L_mu) * p_mu * x_mu)
+*/
+
+/******************************************************************************
+ *                          Plane Wave source                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MSource)
+
+class MomentumPar: Serializable
+{
+public:
+// mom: momentum components as one string, parsed with strToVec<Real> in execute()
+GRID_SERIALIZABLE_CLASS_MEMBERS(MomentumPar,
+std::string, mom);
+};
+
+
+template <typename FImpl>
+class TMomentum: public Module<MomentumPar>
+{
+public:
+FERM_TYPE_ALIASES(FImpl,);
+public:
+// constructor (name is forwarded to Module<MomentumPar>)
+TMomentum(const std::string name);
+// destructor
+virtual ~TMomentum(void) {};
+// dependency relation: no inputs, one output named after the module
+virtual std::vector<std::string> getInput(void);
+virtual std::vector<std::string> getOutput(void);
+// setup: creates the output PropagatorField
+virtual void setup(void);
+// execution: builds the plane wave from par().mom and writes it to the output
+virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(Momentum, TMomentum<FIMPL>, MSource);
+//MODULE_REGISTER_NS(Momentum, TMomentum, MSource);
+
+/******************************************************************************
+*                       TMomentum template implementation                     *
+******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TMomentum<FImpl>::TMomentum(const std::string name)
+: Module<MomentumPar>(name) // the module name is handed to the base class
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TMomentum<FImpl>::getInput(void)
+{
+    // no input objects: an empty list is returned unconditionally
+    return std::vector<std::string>();
+}
+
+template <typename FImpl>
+std::vector<std::string> TMomentum<FImpl>::getOutput(void)
+{
+    // single product, stored under the module's own name
+    return std::vector<std::string>(1, getName());
+}
+
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TMomentum<FImpl>::setup(void)
+{
+    envCreateLat(PropagatorField, getName()); // output field; its content is written by execute()
+}
+
+
+// execution: src = exp(i * sum_mu (2*pi/L_mu) * p_mu * x_mu) ////////////////////
+template <typename FImpl>
+void TMomentum<FImpl>::execute(void)
+{
+    LOG(Message) << "Generating planewave momentum source with momentum " << par().mom << std::endl;
+    PropagatorField   &src = envGet(PropagatorField, getName());
+    LatticeComplex    C(env().getGrid()), coor(env().getGrid());
+    std::vector<Real> p = strToVec<Real>(par().mom);
+    std::vector<Real> latt_size(GridDefaultLatt().begin(), GridDefaultLatt().end());
+    Complex           i(0.0,1.0);
+
+    // p[mu] is read once per lattice direction below, so the counts must match
+    if (p.size() != latt_size.size())
+    {
+        HADRONS_ERROR(Size, "momentum has " + std::to_string(p.size())
+                      + " components (must have " + std::to_string(latt_size.size()) + ")");
+    }
+    C = Zero();
+    for(unsigned int mu = 0; mu < p.size(); mu++){
+        Real TwoPiL = M_PI * 2.0/ latt_size[mu];
+        LatticeCoordinate(coor,mu);
+        C = C +(TwoPiL * p[mu]) * coor;
+    }
+    C = exp(C*i);
+    // setup() only creates src: start from zero instead of its previous content
+    src = Zero();
+    src = src + C;
+    LOG(Message) << "source created" << std::endl;
+}
+
+
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Momentum_hpp_
diff --git a/Hadrons/Modules/MSource/Point.cc b/Hadrons/Modules/MSource/Point.cc
new file mode 100644
index 00000000..43cea943
--- /dev/null
+++ b/Hadrons/Modules/MSource/Point.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSource/Point.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSource/Point.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSource;
+
+template class Grid::Hadrons::MSource::TPoint<FIMPL>;
+template class Grid::Hadrons::MSource::TPoint<ScalarImplCR>;
+
diff --git a/extras/Hadrons/Modules/MSource/Point.hpp b/Hadrons/Modules/MSource/Point.hpp
similarity index 85%
rename from extras/Hadrons/Modules/MSource/Point.hpp
rename to Hadrons/Modules/MSource/Point.hpp
index 4eb8b05a..0d81b34b 100644
--- a/extras/Hadrons/Modules/MSource/Point.hpp
+++ b/Hadrons/Modules/MSource/Point.hpp
@@ -2,12 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MSource/Point.hpp
+Source file: Hadrons/Modules/MSource/Point.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -30,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MSource_Point_hpp_
 #define Hadrons_MSource_Point_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -63,12 +63,12 @@ template <typename FImpl>
 class TPoint: public Module<PointPar>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TPoint(const std::string name);
     // destructor
-    virtual ~TPoint(void) = default;
+    virtual ~TPoint(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -79,8 +79,8 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSource);
-MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSource);
+MODULE_REGISTER_TMP(Point,       TPoint<FIMPL>,        MSource);
+MODULE_REGISTER_TMP(ScalarPoint, TPoint<ScalarImplCR>, MSource);
 
 /******************************************************************************
  *                       TPoint template implementation                       *
@@ -126,6 +126,11 @@ void TPoint<FImpl>::execute(void)
     auto             &src     = envGet(PropagatorField, getName());
     SitePropagator   id;
     
+    if (position.size() != env().getNd())
+    {
+        HADRONS_ERROR(Size, "position has " + std::to_string(position.size())
+                      + " components (must have " + std::to_string(env().getNd()) + ")");
+    }
     id  = 1.;
     src = Zero();
     pokeSite(id, src, position);
diff --git a/Hadrons/Modules/MSource/SeqConserved.cc b/Hadrons/Modules/MSource/SeqConserved.cc
new file mode 100644
index 00000000..1802993b
--- /dev/null
+++ b/Hadrons/Modules/MSource/SeqConserved.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSource/SeqConserved.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSource/SeqConserved.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSource;
+
+template class Grid::Hadrons::MSource::TSeqConserved<FIMPL>;
+
diff --git a/extras/Hadrons/Modules/MSource/SeqConserved.hpp b/Hadrons/Modules/MSource/SeqConserved.hpp
similarity index 55%
rename from extras/Hadrons/Modules/MSource/SeqConserved.hpp
rename to Hadrons/Modules/MSource/SeqConserved.hpp
index 3e8ef457..36715aff 100644
--- a/extras/Hadrons/Modules/MSource/SeqConserved.hpp
+++ b/Hadrons/Modules/MSource/SeqConserved.hpp
@@ -2,11 +2,13 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MContraction/SeqConserved.hpp
+Source file: Hadrons/Modules/MSource/SeqConserved.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
+Author: Vera Guelpers <vmg1n14@soton.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,17 +31,19 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MSource_SeqConserved_hpp_
 #define Hadrons_MSource_SeqConserved_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
 /*
  
- Sequential source
+ Sequential source with insertion of conserved current. 
+ Additionally optional insertion of a photon field A_\mu(x).
  -----------------------------
- * src_x = q_x * theta(x_3 - tA) * theta(tB - x_3) * J_mu * exp(i x.mom)
+ * src_x = sum_{mu=mu_min}^{mu_max} 
+     q_x * theta(x_3 - tA) * theta(tB - x_3) * J_mu * exp(i x.mom) (* A_\mu(x))
  
  * options:
  - q: input propagator (string)
@@ -47,8 +51,10 @@ BEGIN_HADRONS_NAMESPACE
  - tA: begin timeslice (integer)
  - tB: end timesilce (integer)
  - curr_type: type of conserved current to insert (Current)
- - mu: Lorentz index of current to insert (integer)
+ - mu_min: begin Lorentz Index (integer)
+ - mu_max: end Lorentz Index (integer)
  - mom: momentum insertion, space-separated float sequence (e.g ".1 .2 1. 0.")
+ - photon: optional photon field (string)
  
  */
 
@@ -66,8 +72,10 @@ public:
                                     unsigned int, tA,
                                     unsigned int, tB,
                                     Current,      curr_type,
-                                    unsigned int, mu,
-                                    std::string,  mom);
+                                    unsigned int, mu_min,
+                                    unsigned int, mu_max,
+                                    std::string,  mom,
+                                    std::string,  photon);
 };
 
 template <typename FImpl>
@@ -75,11 +83,13 @@ class TSeqConserved: public Module<SeqConservedPar>
 {
 public:
     FERM_TYPE_ALIASES(FImpl,);
+public:
+    typedef PhotonR::GaugeField     EmField;
 public:
     // constructor
     TSeqConserved(const std::string name);
     // destructor
-    virtual ~TSeqConserved(void) = default;
+    virtual ~TSeqConserved(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -88,9 +98,13 @@ protected:
     virtual void setup(void);
     // execution
     virtual void execute(void);
+private:
+    bool        SeqhasPhase_{false}; 
+    std::string SeqmomphName_;
 };
 
-MODULE_REGISTER_NS(SeqConserved, TSeqConserved<FIMPL>, MSource);
+MODULE_REGISTER_TMP(SeqConserved, TSeqConserved<FIMPL>, MSource);
+
 
 /******************************************************************************
  *                      TSeqConserved implementation                          *
@@ -99,6 +113,7 @@ MODULE_REGISTER_NS(SeqConserved, TSeqConserved<FIMPL>, MSource);
 template <typename FImpl>
 TSeqConserved<FImpl>::TSeqConserved(const std::string name)
 : Module<SeqConservedPar>(name)
+, SeqmomphName_ (name + "_Seqmomph")
 {}
 
 // dependencies/products ///////////////////////////////////////////////////////
@@ -106,7 +121,8 @@ template <typename FImpl>
 std::vector<std::string> TSeqConserved<FImpl>::getInput(void)
 {
     std::vector<std::string> in = {par().q, par().action};
-    
+    if (!par().photon.empty()) in.push_back(par().photon);
+        
     return in;
 }
 
@@ -115,7 +131,7 @@ std::vector<std::string> TSeqConserved<FImpl>::getOutput(void)
 {
     std::vector<std::string> out = {getName()};
     
-    return out;
+   return out;
 }
 
 // setup ///////////////////////////////////////////////////////////////////////
@@ -124,6 +140,10 @@ void TSeqConserved<FImpl>::setup(void)
 {
     auto Ls_ = env().getObjectLs(par().action);
     envCreateLat(PropagatorField, getName(), Ls_);
+    envTmpLat(PropagatorField, "src_tmp");
+    envCacheLat(LatticeComplex, SeqmomphName_);
+    envTmpLat(LatticeComplex, "coor");
+    envTmpLat(LatticeComplex, "latt_compl");
 }
 
 // execution ///////////////////////////////////////////////////////////////////
@@ -133,27 +153,79 @@ void TSeqConserved<FImpl>::execute(void)
     if (par().tA == par().tB)
     {
         LOG(Message) << "Generating sequential source with conserved "
-                     << par().curr_type << " current insertion (mu = " 
-                     << par().mu << ") at " << "t = " << par().tA << std::endl;
+                     << par().curr_type << " current at " 
+		     << "t = " << par().tA << " summed over the indices " 
+		     << par().mu_min << " <= mu <= " << par().mu_max 
+		     << std::endl;
     }
     else
     {
         LOG(Message) << "Generating sequential source with conserved "
-                     << par().curr_type << " current insertion (mu = " 
-                     << par().mu << ") for " << par().tA << " <= t <= " 
-                     << par().tB << std::endl;
+                     << par().curr_type << " current for " 
+                     << par().tA << " <= t <= " 
+                     << par().tB << " summed over the indices " 
+		     << par().mu_min << " <= mu <= " << par().mu_max
+	             << std::endl;
     }
     auto &src = envGet(PropagatorField, getName());
+    envGetTmp(PropagatorField, src_tmp);
+    src_tmp = src;
     auto &q   = envGet(PropagatorField, par().q);
     auto &mat = envGet(FMat, par().action);
+    envGetTmp(LatticeComplex, latt_compl);
 
-    std::vector<Real> mom = strToVec<Real>(par().mom);
-    mat.SeqConservedCurrent(q, src, par().curr_type, par().mu, 
-                            mom, par().tA, par().tB);
+    src = Zero();
+
+    //exp(ipx)
+    auto &mom_phase = envGet(LatticeComplex, SeqmomphName_);
+    if (!SeqhasPhase_)
+    {    
+        std::vector<Real> mom = strToVec<Real>(par().mom);
+        mom_phase = Zero();
+        Complex           i(0.0,1.0);
+        envGetTmp(LatticeComplex, coor);
+        for(unsigned int mu = 0; mu < env().getNd(); mu++)
+        {
+            LatticeCoordinate(coor, mu);
+            mom_phase = mom_phase + (mom[mu]/env().getDim(mu))*coor;
+        }
+        mom_phase = exp((Real)(2*M_PI)*i*mom_phase);
+        SeqhasPhase_ = true;
+    }
+    LOG(Message) << "Inserting momentum " << strToVec<Real>(par().mom) << std::endl;
+
+
+
+    if (!par().photon.empty())    	
+    {
+	 LOG(Message) << "Inserting the stochastic photon field " << par().photon << std::endl;
+    }
+
+    for(unsigned int mu=par().mu_min;mu<=par().mu_max;mu++)
+    {
+        if (!par().photon.empty())    	
+        {
+	    //Get the stochastic photon field, if required
+            auto &stoch_photon = envGet(EmField,  par().photon);
+    	    latt_compl =  PeekIndex<LorentzIndex>(stoch_photon, mu) * mom_phase;
+        }
+        else
+        {
+            latt_compl = mom_phase;
+        } 
+
+    	mat.SeqConservedCurrent(q, src_tmp, par().curr_type, mu, 
+                             par().tA, par().tB, latt_compl);
+	src += src_tmp;
+
+    }	
+
+ 
 }
 
+
 END_MODULE_NAMESPACE
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_SeqConserved_hpp_
+#endif // Hadrons_MSource_SeqConserved_hpp_
diff --git a/Hadrons/Modules/MSource/SeqGamma.cc b/Hadrons/Modules/MSource/SeqGamma.cc
new file mode 100644
index 00000000..64d31478
--- /dev/null
+++ b/Hadrons/Modules/MSource/SeqGamma.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSource/SeqGamma.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSource/SeqGamma.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSource;
+
+template class Grid::Hadrons::MSource::TSeqGamma<FIMPL>;
+
diff --git a/extras/Hadrons/Modules/MSource/SeqGamma.hpp b/Hadrons/Modules/MSource/SeqGamma.hpp
similarity index 91%
rename from extras/Hadrons/Modules/MSource/SeqGamma.hpp
rename to Hadrons/Modules/MSource/SeqGamma.hpp
index 8ff9771e..b8278142 100644
--- a/extras/Hadrons/Modules/MSource/SeqGamma.hpp
+++ b/Hadrons/Modules/MSource/SeqGamma.hpp
@@ -2,13 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MSource/SeqGamma.hpp
+Source file: Hadrons/Modules/MSource/SeqGamma.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -31,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MSource_SeqGamma_hpp_
 #define Hadrons_MSource_SeqGamma_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -72,12 +71,12 @@ template <typename FImpl>
 class TSeqGamma: public Module<SeqGammaPar>
 {
 public:
-    FGS_TYPE_ALIASES(FImpl,);
+    FERM_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TSeqGamma(const std::string name);
     // destructor
-    virtual ~TSeqGamma(void) = default;
+    virtual ~TSeqGamma(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -91,7 +90,7 @@ private:
     std::string momphName_, tName_;
 };
 
-MODULE_REGISTER_NS(SeqGamma, TSeqGamma<FIMPL>, MSource);
+MODULE_REGISTER_TMP(SeqGamma, TSeqGamma<FIMPL>, MSource);
 
 /******************************************************************************
  *                         TSeqGamma implementation                           *
@@ -126,7 +125,7 @@ template <typename FImpl>
 void TSeqGamma<FImpl>::setup(void)
 {
     envCreateLat(PropagatorField, getName());
-    envCacheLat(Lattice<iScalar<vInteger>>, tName_);
+    envCache(Lattice<iScalar<vInteger>>, tName_, 1, envGetGrid(LatticeComplex));
     envCacheLat(LatticeComplex, momphName_);
     envTmpLat(LatticeComplex, "coor");
 }
@@ -163,7 +162,7 @@ void TSeqGamma<FImpl>::execute(void)
         for(unsigned int mu = 0; mu < env().getNd(); mu++)
         {
             LatticeCoordinate(coor, mu);
-            ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            ph = ph + (p[mu]/env().getDim(mu))*coor;
         }
         ph = exp((Real)(2*M_PI)*i*ph);
         LatticeCoordinate(t, Tp);
diff --git a/Hadrons/Modules/MSource/Wall.cc b/Hadrons/Modules/MSource/Wall.cc
new file mode 100644
index 00000000..dbc46293
--- /dev/null
+++ b/Hadrons/Modules/MSource/Wall.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSource/Wall.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSource/Wall.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSource;
+
+template class Grid::Hadrons::MSource::TWall<FIMPL>; // explicit instantiation definition of TWall for the FIMPL implementation
+
diff --git a/extras/Hadrons/Modules/MSource/Wall.hpp b/Hadrons/Modules/MSource/Wall.hpp
similarity index 91%
rename from extras/Hadrons/Modules/MSource/Wall.hpp
rename to Hadrons/Modules/MSource/Wall.hpp
index 34058b7a..9ec0f391 100644
--- a/extras/Hadrons/Modules/MSource/Wall.hpp
+++ b/Hadrons/Modules/MSource/Wall.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MSource/Wall.hpp
+Source file: Hadrons/Modules/MSource/Wall.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MSource_WallSource_hpp_
 #define Hadrons_MSource_WallSource_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -69,7 +70,7 @@ public:
     // constructor
     TWall(const std::string name);
     // destructor
-    virtual ~TWall(void) = default;
+    virtual ~TWall(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -83,7 +84,7 @@ private:
     std::string momphName_, tName_;
 };
 
-MODULE_REGISTER_NS(Wall, TWall<FIMPL>, MSource);
+MODULE_REGISTER_TMP(Wall, TWall<FIMPL>, MSource);
 
 /******************************************************************************
  *                 TWall implementation                                       *
@@ -142,7 +143,7 @@ void TWall<FImpl>::execute(void)
         for(unsigned int mu = 0; mu < env().getNd(); mu++)
         {
             LatticeCoordinate(coor, mu);
-            ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
+            ph = ph + (p[mu]/env().getDim(mu))*coor;
         }
         ph = exp((Real)(2*M_PI)*i*ph);
         LatticeCoordinate(t, Tp);
diff --git a/Hadrons/Modules/MSource/Z2.cc b/Hadrons/Modules/MSource/Z2.cc
new file mode 100644
index 00000000..2fc95532
--- /dev/null
+++ b/Hadrons/Modules/MSource/Z2.cc
@@ -0,0 +1,36 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MSource/Z2.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MSource/Z2.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSource;
+
+template class Grid::Hadrons::MSource::TZ2<FIMPL>; // explicit instantiation for the type registered as "Z2" in Z2.hpp
+template class Grid::Hadrons::MSource::TZ2<ScalarImplCR>; // explicit instantiation for the type registered as "ScalarZ2" in Z2.hpp
+
diff --git a/extras/Hadrons/Modules/MSource/Z2.hpp b/Hadrons/Modules/MSource/Z2.hpp
similarity index 90%
rename from extras/Hadrons/Modules/MSource/Z2.hpp
rename to Hadrons/Modules/MSource/Z2.hpp
index 3593cb34..e16114e0 100644
--- a/extras/Hadrons/Modules/MSource/Z2.hpp
+++ b/Hadrons/Modules/MSource/Z2.hpp
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MSource/Z2.hpp
+Source file: Hadrons/Modules/MSource/Z2.hpp
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -30,9 +29,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MSource_Z2_hpp_
 #define Hadrons_MSource_Z2_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -67,12 +66,12 @@ template <typename FImpl>
 class TZ2: public Module<Z2Par>
 {
 public:
-    FERM_TYPE_ALIASES(FImpl,);
+    BASIC_TYPE_ALIASES(FImpl,);
 public:
     // constructor
     TZ2(const std::string name);
     // destructor
-    virtual ~TZ2(void) = default;
+    virtual ~TZ2(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -86,8 +85,8 @@ private:
     std::string tName_;
 };
 
-MODULE_REGISTER_NS(Z2,       TZ2<FIMPL>,        MSource);
-MODULE_REGISTER_NS(ScalarZ2, TZ2<ScalarImplCR>, MSource);
+MODULE_REGISTER_TMP(Z2,       TZ2<FIMPL>,        MSource);
+MODULE_REGISTER_TMP(ScalarZ2, TZ2<ScalarImplCR>, MSource);
 
 /******************************************************************************
  *                       TZ2 template implementation                          *
@@ -121,7 +120,7 @@ template <typename FImpl>
 void TZ2<FImpl>::setup(void)
 {
     envCreateLat(PropagatorField, getName());
-    envCacheLat(Lattice<iScalar<vInteger>>, tName_);
+    envCache(Lattice<iScalar<vInteger>>, tName_, 1, envGetGrid(LatticeComplex));
     envTmpLat(LatticeComplex, "eta");
 }
 
@@ -150,7 +149,7 @@ void TZ2<FImpl>::execute(void)
         hasT_ = true;
     }
     envGetTmp(LatticeComplex, eta);
-    bernoulli(*env().get4dRng(), eta);
+    bernoulli(rng4d(), eta);
     eta = (2.*eta - shift)*(1./::sqrt(2.));
     eta = where((t >= par().tA) and (t <= par().tB), eta, 0.*eta);
     src = 1.;
diff --git a/Hadrons/Modules/MUtilities/PrecisionCast.cc b/Hadrons/Modules/MUtilities/PrecisionCast.cc
new file mode 100644
index 00000000..556ededc
--- /dev/null
+++ b/Hadrons/Modules/MUtilities/PrecisionCast.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MUtilities/PrecisionCast.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MUtilities;
+
+template class Grid::Hadrons::MUtilities::TPrecisionCast<GIMPLD::GaugeField, GIMPLF::GaugeField>; // explicit instantiation for the type registered as GaugeSinglePrecisionCast
+template class Grid::Hadrons::MUtilities::TPrecisionCast<FIMPLD::FermionField, FIMPLF::FermionField>; // explicit instantiation for the type registered as FermionSinglePrecisionCast
diff --git a/Hadrons/Modules/MUtilities/PrecisionCast.hpp b/Hadrons/Modules/MUtilities/PrecisionCast.hpp
new file mode 100644
index 00000000..39a5ff73
--- /dev/null
+++ b/Hadrons/Modules/MUtilities/PrecisionCast.hpp
@@ -0,0 +1,124 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MUtilities/PrecisionCast.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MUtilities_PrecisionCast_hpp_
+#define Hadrons_MUtilities_PrecisionCast_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                          Precision cast module                             *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MUtilities)
+
+class PrecisionCastPar: Serializable // parameter set of the precision cast module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(PrecisionCastPar,
+                                    std::string, field); // name of the environment object to cast (the module's only input)
+};
+
+template <typename FieldIn, typename FieldOut>
+class TPrecisionCast: public Module<PrecisionCastPar> // module casting an object of type FieldIn into a new object of type FieldOut
+{
+public:
+    // constructor (forwards the module name to Module<PrecisionCastPar>)
+    TPrecisionCast(const std::string name);
+    // destructor (nothing to release in this class)
+    virtual ~TPrecisionCast(void) {};
+    // dependency relation: input is par().field, output is the module name
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the FieldOut object named after the module
+    virtual void setup(void);
+    // execution: logs both type names and calls precisionChange(out, in)
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(GaugeSinglePrecisionCast, 
+                    ARG(TPrecisionCast<GIMPLD::GaugeField, GIMPLF::GaugeField>), // NOTE(review): ARG presumably shields the template-argument comma from macro argument splitting - confirm
+                    MUtilities); // gauge field cast: FieldIn = GIMPLD::GaugeField, FieldOut = GIMPLF::GaugeField
+MODULE_REGISTER_TMP(FermionSinglePrecisionCast, 
+                    ARG(TPrecisionCast<FIMPLD::FermionField, FIMPLF::FermionField>),
+                    MUtilities); // fermion field cast: FieldIn = FIMPLD::FermionField, FieldOut = FIMPLF::FermionField
+
+/******************************************************************************
+ *                     TPrecisionCast implementation                          *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FieldIn, typename FieldOut>
+TPrecisionCast<FieldIn, FieldOut>::TPrecisionCast(const std::string name)
+: Module<PrecisionCastPar>(name) // only forwards the module name to the base class
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FieldIn, typename FieldOut>
+std::vector<std::string> TPrecisionCast<FieldIn, FieldOut>::getInput(void) // returns the names of the objects this module reads
+{
+    std::vector<std::string> in = {par().field}; // single dependency: the object named by the "field" parameter
+    
+    return in;
+}
+
+template <typename FieldIn, typename FieldOut>
+std::vector<std::string> TPrecisionCast<FieldIn, FieldOut>::getOutput(void) // returns the names of the objects this module produces
+{
+    std::vector<std::string> out = {getName()}; // single product, named after the module itself
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FieldIn, typename FieldOut>
+void TPrecisionCast<FieldIn, FieldOut>::setup(void)
+{
+    envCreateLat(FieldOut, getName()); // declares the FieldOut object under the module name; execute() fetches it with envGet
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FieldIn, typename FieldOut>
+void TPrecisionCast<FieldIn, FieldOut>::execute(void) // logs the conversion, then casts the input object into the output object
+{
+    LOG(Message) << "Casting field '" << par().field << "'" << std::endl;
+    LOG(Message) << "In  type: " << typeName<FieldIn>() << std::endl;
+    LOG(Message) << "Out type: " << typeName<FieldOut>() << std::endl;
+
+    auto &in  = envGet(FieldIn,  par().field); // source object, looked up by the name given in the parameters
+    auto &out = envGet(FieldOut, getName()); // destination object, declared in setup() under the module name
+
+    precisionChange(out, in); // destination first, source second; NOTE(review): presumably converts 'in' to the precision of 'out' - confirm against Grid
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MUtilities_PrecisionCast_hpp_
diff --git a/Hadrons/Modules/MUtilities/RandomVectors.cc b/Hadrons/Modules/MUtilities/RandomVectors.cc
new file mode 100644
index 00000000..eb3b9cad
--- /dev/null
+++ b/Hadrons/Modules/MUtilities/RandomVectors.cc
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MUtilities/RandomVectors.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MUtilities;
+
+template class Grid::Hadrons::MUtilities::TRandomVectors<FIMPL::FermionField>; // explicit instantiation for the type registered as RandomFermions
diff --git a/Hadrons/Modules/MUtilities/RandomVectors.hpp b/Hadrons/Modules/MUtilities/RandomVectors.hpp
new file mode 100644
index 00000000..65126eb7
--- /dev/null
+++ b/Hadrons/Modules/MUtilities/RandomVectors.hpp
@@ -0,0 +1,128 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MUtilities/RandomVectors.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_MUtilities_RandomVectors_hpp_
+#define Hadrons_MUtilities_RandomVectors_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *           Module generating random lattices for testing purposes           *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MUtilities)
+
+class RandomVectorsPar: Serializable // parameter set of the random vectors module
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(RandomVectorsPar,
+                                    unsigned int, size, // number of vectors, as reported in the execute() log message
+                                    unsigned int, Ls); // values > 1 are forwarded to envCreate/envGetGrid in setup(); NOTE(review): presumably the 5th-dimension extent - confirm
+};
+
+template <typename Field>
+class TRandomVectors: public Module<RandomVectorsPar> // module producing a std::vector<Field> filled through random()
+{
+public:
+    // constructor (forwards the module name to Module<RandomVectorsPar>)
+    TRandomVectors(const std::string name);
+    // destructor (nothing to release in this class)
+    virtual ~TRandomVectors(void) {};
+    // dependency relation: no input, output is the module name
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: declares the std::vector<Field> object named after the module
+    virtual void setup(void);
+    // execution: calls random() on every element of the vector
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(RandomFermions, TRandomVectors<FIMPL::FermionField>, MUtilities); // registers the FIMPL fermion-field instantiation under the name RandomFermions
+
+/******************************************************************************
+ *                      TRandomVectors implementation                         *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename Field>
+TRandomVectors<Field>::TRandomVectors(const std::string name)
+: Module<RandomVectorsPar>(name) // only forwards the module name to the base class
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename Field>
+std::vector<std::string> TRandomVectors<Field>::getInput(void) // returns the names of the objects this module reads
+{
+    std::vector<std::string> in; // left empty: this module reads no other object
+    
+    return in;
+}
+
+template <typename Field>
+std::vector<std::string> TRandomVectors<Field>::getOutput(void) // returns the names of the objects this module produces
+{
+    std::vector<std::string> out = {getName()}; // single product, named after the module itself
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename Field>
+void TRandomVectors<Field>::setup(void) // declares the output std::vector<Field>, choosing the grid from the Ls parameter
+{
+    if (par().Ls > 1) // Ls > 1: forward Ls to both envCreate and envGetGrid
+    {
+        envCreate(std::vector<Field>, getName(), par().Ls, par().size, 
+                  envGetGrid(Field, par().Ls));
+    }
+    else // Ls of 0 or 1: pass 1 where the other branch passes Ls, and use envGetGrid without an Ls argument
+    {
+        envCreate(std::vector<Field>, getName(), 1, par().size, envGetGrid(Field));
+    }
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename Field>
+void TRandomVectors<Field>::execute(void) // fills every vector declared in setup() through random()
+{
+    LOG(Message) << "Generating " << par().size << " random vectors" << std::endl;
+
+    auto &vec = envGet(std::vector<Field>, getName()); // the object declared in setup() under the module name
+    
+    for (unsigned int i = 0; i < vec.size(); ++i)
+    {
+        random(rng4d(), vec[i]); // NOTE(review): rng4d() is used even when setup() took the Ls > 1 branch - confirm this is intended
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MUtilities_RandomVectors_hpp_
diff --git a/Hadrons/Modules/MUtilities/TestSeqConserved.cc b/Hadrons/Modules/MUtilities/TestSeqConserved.cc
new file mode 100644
index 00000000..e15eea2a
--- /dev/null
+++ b/Hadrons/Modules/MUtilities/TestSeqConserved.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MUtilities/TestSeqConserved.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MUtilities;
+
+template class Grid::Hadrons::MUtilities::TTestSeqConserved<FIMPL>; // explicit instantiation for the type registered as TestSeqConserved
+
diff --git a/extras/Hadrons/Modules/MUtilities/TestSeqConserved.hpp b/Hadrons/Modules/MUtilities/TestSeqConserved.hpp
similarity index 92%
rename from extras/Hadrons/Modules/MUtilities/TestSeqConserved.hpp
rename to Hadrons/Modules/MUtilities/TestSeqConserved.hpp
index 0647884c..c1c84105 100644
--- a/extras/Hadrons/Modules/MUtilities/TestSeqConserved.hpp
+++ b/Hadrons/Modules/MUtilities/TestSeqConserved.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MUtilities/TestSeqConserved.hpp
+Source file: Hadrons/Modules/MUtilities/TestSeqConserved.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MUtilities_TestSeqConserved_hpp_
 #define Hadrons_MUtilities_TestSeqConserved_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -76,7 +77,7 @@ public:
     // constructor
     TTestSeqConserved(const std::string name);
     // destructor
-    virtual ~TTestSeqConserved(void) = default;
+    virtual ~TTestSeqConserved(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -87,7 +88,7 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(TestSeqConserved, TTestSeqConserved<FIMPL>, MUtilities);
+MODULE_REGISTER_TMP(TestSeqConserved, TTestSeqConserved<FIMPL>, MUtilities);
 
 /******************************************************************************
  *                     TTestSeqConserved implementation                       *
@@ -122,7 +123,7 @@ void TTestSeqConserved<FImpl>::setup(void)
     auto Ls = env().getObjectLs(par().q);
     if (Ls != env().getObjectLs(par().action))
     {
-        HADRON_ERROR(Size, "Ls mismatch between quark action and propagator");
+        HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
     }
     envTmpLat(PropagatorField, "tmp");
     envTmpLat(LatticeComplex, "c");
diff --git a/Hadrons/Modules/MUtilities/TestSeqGamma.cc b/Hadrons/Modules/MUtilities/TestSeqGamma.cc
new file mode 100644
index 00000000..db9c824d
--- /dev/null
+++ b/Hadrons/Modules/MUtilities/TestSeqGamma.cc
@@ -0,0 +1,35 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Modules/MUtilities/TestSeqGamma.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MUtilities;
+
+template class Grid::Hadrons::MUtilities::TTestSeqGamma<FIMPL>; // explicit instantiation for the type registered as TestSeqGamma
+
diff --git a/extras/Hadrons/Modules/MUtilities/TestSeqGamma.hpp b/Hadrons/Modules/MUtilities/TestSeqGamma.hpp
similarity index 92%
rename from extras/Hadrons/Modules/MUtilities/TestSeqGamma.hpp
rename to Hadrons/Modules/MUtilities/TestSeqGamma.hpp
index fd53eab8..728dda05 100644
--- a/extras/Hadrons/Modules/MUtilities/TestSeqGamma.hpp
+++ b/Hadrons/Modules/MUtilities/TestSeqGamma.hpp
@@ -2,11 +2,12 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Modules/MUtilities/TestSeqGamma.hpp
+Source file: Hadrons/Modules/MUtilities/TestSeqGamma.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
-Author: Andrew Lawson    <andrew.lawson1991@gmail.com>
+Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Lanny91 <andrew.lawson@gmail.com>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,9 +30,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_MUtilities_TestSeqGamma_hpp_
 #define Hadrons_MUtilities_TestSeqGamma_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -60,7 +61,7 @@ public:
     // constructor
     TTestSeqGamma(const std::string name);
     // destructor
-    virtual ~TTestSeqGamma(void) = default;
+    virtual ~TTestSeqGamma(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -71,7 +72,7 @@ protected:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(TestSeqGamma, TTestSeqGamma<FIMPL>, MUtilities);
+MODULE_REGISTER_TMP(TestSeqGamma, TTestSeqGamma<FIMPL>, MUtilities);
 
 /******************************************************************************
  *                      TTestSeqGamma implementation                          *
diff --git a/extras/Hadrons/Modules/templates/Module_in_NS.cc.template b/Hadrons/Modules/templates/Module_in_NS.cc.template
similarity index 93%
rename from extras/Hadrons/Modules/templates/Module_in_NS.cc.template
rename to Hadrons/Modules/templates/Module_in_NS.cc.template
index 8b2a0ec0..477148e0 100644
--- a/extras/Hadrons/Modules/templates/Module_in_NS.cc.template
+++ b/Hadrons/Modules/templates/Module_in_NS.cc.template
@@ -1,4 +1,4 @@
-#include <Grid/Hadrons/Modules/___NAMESPACE___/___FILEBASENAME___.hpp>
+#include <Hadrons/Modules/___NAMESPACE___/___FILEBASENAME___.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
diff --git a/extras/Hadrons/Modules/templates/Module_in_NS.hpp.template b/Hadrons/Modules/templates/Module_in_NS.hpp.template
similarity index 81%
rename from extras/Hadrons/Modules/templates/Module_in_NS.hpp.template
rename to Hadrons/Modules/templates/Module_in_NS.hpp.template
index ea77b12a..982babda 100644
--- a/extras/Hadrons/Modules/templates/Module_in_NS.hpp.template
+++ b/Hadrons/Modules/templates/Module_in_NS.hpp.template
@@ -1,9 +1,9 @@
 #ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 #define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -25,7 +25,7 @@ public:
     // constructor
     T___FILEBASENAME___(const std::string name);
     // destructor
-    virtual ~T___FILEBASENAME___(void) = default;
+    virtual ~T___FILEBASENAME___(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -35,7 +35,7 @@ public:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(___FILEBASENAME___, T___FILEBASENAME___, ___NAMESPACE___);
+MODULE_REGISTER(___FILEBASENAME___, T___FILEBASENAME___, ___NAMESPACE___);
 
 END_MODULE_NAMESPACE
 
diff --git a/Hadrons/Modules/templates/Module_tmp_in_NS.cc.template b/Hadrons/Modules/templates/Module_tmp_in_NS.cc.template
new file mode 100644
index 00000000..d8a19618
--- /dev/null
+++ b/Hadrons/Modules/templates/Module_tmp_in_NS.cc.template
@@ -0,0 +1,7 @@
+#include <Hadrons/Modules/___NAMESPACE___/___FILEBASENAME___.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace ___NAMESPACE___;
+
+template class Grid::Hadrons::___NAMESPACE___::T___FILEBASENAME___<FIMPL>;
diff --git a/extras/Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template b/Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template
similarity index 90%
rename from extras/Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template
rename to Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template
index b79c0ad3..da5bc370 100644
--- a/extras/Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template
+++ b/Hadrons/Modules/templates/Module_tmp_in_NS.hpp.template
@@ -1,9 +1,9 @@
 #ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 #define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -26,7 +26,7 @@ public:
     // constructor
     T___FILEBASENAME___(const std::string name);
     // destructor
-    virtual ~T___FILEBASENAME___(void) = default;
+    virtual ~T___FILEBASENAME___(void) {};
     // dependency relation
     virtual std::vector<std::string> getInput(void);
     virtual std::vector<std::string> getOutput(void);
@@ -36,7 +36,7 @@ public:
     virtual void execute(void);
 };
 
-MODULE_REGISTER_NS(___FILEBASENAME___, T___FILEBASENAME___<FIMPL>, ___NAMESPACE___);
+MODULE_REGISTER_TMP(___FILEBASENAME___, T___FILEBASENAME___<FIMPL>, ___NAMESPACE___);
 
 /******************************************************************************
  *                 T___FILEBASENAME___ implementation                             *
diff --git a/extras/Hadrons/Exceptions.hpp b/Hadrons/Solver.hpp
similarity index 51%
rename from extras/Hadrons/Exceptions.hpp
rename to Hadrons/Solver.hpp
index 8f04ab41..adba9665 100644
--- a/extras/Hadrons/Exceptions.hpp
+++ b/Hadrons/Solver.hpp
@@ -2,9 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/Exceptions.hpp
+Source file: Hadrons/Solver.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -25,48 +25,38 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
+#ifndef Hadrons_Solver_hpp_
+#define Hadrons_Solver_hpp_
 
-#ifndef Hadrons_Exceptions_hpp_
-#define Hadrons_Exceptions_hpp_
-
-#include <stdexcept>
-#ifndef Hadrons_Global_hpp_
-#include <Grid/Hadrons/Global.hpp>
-#endif
-
-#define SRC_LOC std::string(__FUNCTION__) + " at " + std::string(__FILE__) + ":"\
-                + std::to_string(__LINE__)
-#define HADRON_ERROR(exc, msg)\
-LOG(Error) << msg << std::endl;\
-throw(Exceptions::exc(msg, SRC_LOC));
-
-#define DECL_EXC(name, base) \
-class name: public base\
-{\
-public:\
-    name(std::string msg, std::string loc);\
-}
+#include <Hadrons/Global.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
-namespace Exceptions
+template <typename FImpl> // FImpl provides the FermionField type used below
+class Solver              // bundles a solver callback with a reference to a fermion matrix
 {
-    // logic errors
-    DECL_EXC(Logic, std::logic_error);
-    DECL_EXC(Definition, Logic);
-    DECL_EXC(Implementation, Logic);
-    DECL_EXC(Range, Logic);
-    DECL_EXC(Size, Logic);
-    // runtime errors
-    DECL_EXC(Runtime, std::runtime_error);
-    DECL_EXC(Argument, Runtime);
-    DECL_EXC(Io, Runtime);
-    DECL_EXC(Memory, Runtime);
-    DECL_EXC(Parsing, Runtime);
-    DECL_EXC(Program, Runtime);
-    DECL_EXC(System, Runtime);
-}
+public:
+    typedef typename FImpl::FermionField                      FermionField;
+    typedef FermionOperator<FImpl>                            FMat; 
+    typedef std::function<void(FermionField &, 
+                               const FermionField &)>         SolverFn; // invoked as fn(sol, src)
+public:
+    Solver(SolverFn fn, FMat &mat): mat_(mat), fn_(fn) {} // copies fn, keeps only a reference to mat
+
+    void operator()(FermionField &sol, const FermionField &src) // forwards (sol, src) to the stored callback
+    {
+        fn_(sol, src);
+    }
+
+    FMat & getFMat(void) // returns the matrix given at construction
+    {
+        return mat_;
+    }
+private:
+    FMat     &mat_; // not owned: the referenced matrix must outlive this object
+    SolverFn fn_;
+};
 
 END_HADRONS_NAMESPACE
 
-#endif // Hadrons_Exceptions_hpp_
+#endif // Hadrons_Solver_hpp_
diff --git a/Hadrons/TimerArray.cc b/Hadrons/TimerArray.cc
new file mode 100644
index 00000000..2b85bc7e
--- /dev/null
+++ b/Hadrons/TimerArray.cc
@@ -0,0 +1,126 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/TimerArray.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/TimerArray.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace Hadrons;
+
+// Start the stopwatch called 'name'; an empty name is silently ignored.
+void TimerArray::startTimer(const std::string &name)
+{
+    if (name.empty()) return;
+
+    // operator[] creates the map entry the first time a name is seen
+    timer_[name].Start();
+}
+
+// Elapsed time of timer 'name'; GridTime::zero() when the name is empty or
+// no timer with that name exists. A running timer is stopped to be read
+// and started again right after.
+GridTime TimerArray::getTimer(const std::string &name)
+{
+    GridTime elapsed = GridTime::zero();
+    if (name.empty())
+    {
+        return elapsed;
+    }
+    try
+    {
+        // at() throws std::out_of_range for an unknown name
+        const bool wasRunning = timer_.at(name).isRunning();
+
+        if (wasRunning) stopTimer(name);
+        elapsed = timer_.at(name).Elapsed();
+        if (wasRunning) startTimer(name);
+    }
+    catch (std::out_of_range &)
+    {
+        elapsed = GridTime::zero();
+    }
+
+    return elapsed;
+}
+
+double TimerArray::getDTimer(const std::string &name) // getTimer(name) converted to a plain double
+{
+    return static_cast<double>(getTimer(name).count()); // tick count of GridTime; presumably microseconds -- TODO confirm
+}
+
+// Make 'name' the current timer: stop the previous current timer, if any,
+// then start 'name'. Nothing happens for an empty name.
+void TimerArray::startCurrentTimer(const std::string &name)
+{
+    if (name.empty()) return;
+    stopCurrentTimer();
+    startTimer(name);
+    currentTimer_ = name;
+}
+
+// Stop timer 'name' if it is running. timer_.at() throws
+// std::out_of_range when no timer with that name exists.
+void TimerArray::stopTimer(const std::string &name)
+{
+    auto &watch = timer_.at(name);
+    if (watch.isRunning()) watch.Stop();
+}
+
+// Stop the current timer, if one is set, and clear the record of which
+// timer is current.
+void TimerArray::stopCurrentTimer(void)
+{
+    if (currentTimer_.empty()) return;
+    stopTimer(currentTimer_);
+    currentTimer_.clear();
+}
+
+void TimerArray::stopAllTimers(void) // stop every timer that is still running
+{
+    for (auto &t: timer_)
+    {
+        stopTimer(t.first); // does nothing for a timer that is not running
+    }
+    currentTimer_ = ""; // no timer is current any more
+}
+
+void TimerArray::resetTimers(void) // forget all timers and the current-timer name
+{
+    timer_.clear(); // removes the stopwatches themselves, not just their readings
+    currentTimer_ = "";
+}
+
+std::map<std::string, GridTime> TimerArray::getTimings(void) // snapshot: timer name -> Elapsed()
+{
+    std::map<std::string, GridTime> timing;
+
+    for (auto &t: timer_)
+    {
+        timing[t.first] = t.second.Elapsed(); // read as is: unlike getTimer(), running timers are not stopped first
+    }
+
+    return timing;
+}
diff --git a/Hadrons/TimerArray.hpp b/Hadrons/TimerArray.hpp
new file mode 100644
index 00000000..77cc2b8c
--- /dev/null
+++ b/Hadrons/TimerArray.hpp
@@ -0,0 +1,56 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/TimerArray.hpp
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef Hadrons_TimerArray_hpp_
+#define Hadrons_TimerArray_hpp_
+
+#include <Hadrons/Global.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+class TimerArray // set of stopwatches addressed by name, with an optional "current" timer
+{
+public:
+    TimerArray(void) = default;
+    virtual ~TimerArray(void) = default;
+    void                            startTimer(const std::string &name); // ignored when name is empty
+    GridTime                        getTimer(const std::string &name); // zero for an empty or unknown name
+    double                          getDTimer(const std::string &name); // getTimer() as a double
+    void                            startCurrentTimer(const std::string &name); // stops the previous current timer first
+    void                            stopTimer(const std::string &name); // uses map::at, so an unknown name throws
+    void                            stopCurrentTimer(void);
+    void                            stopAllTimers(void);
+    void                            resetTimers(void); // erases every timer
+    std::map<std::string, GridTime> getTimings(void); // name -> elapsed time, for all timers
+private:
+    std::string                          currentTimer_; // empty when no timer is current
+    std::map<std::string, GridStopWatch> timer_; 
+};
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_TimerArray_hpp_
diff --git a/Hadrons/Utilities/Contractor.cc b/Hadrons/Utilities/Contractor.cc
new file mode 100644
index 00000000..11ea0bc1
--- /dev/null
+++ b/Hadrons/Utilities/Contractor.cc
@@ -0,0 +1,454 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Utilities/Contractor.cc
+
+Copyright (C) 2015-2018
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+#include <Hadrons/DiskVector.hpp>
+#include <Hadrons/TimerArray.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace Hadrons;
+
+#define TIME_MOD(t) (((t) + par.global.nt) % par.global.nt) // reduces t modulo nt; expects a variable 'par' in scope
+
+namespace Contractor // serializable parameter and result records of the contractor program
+{
+    class TrajRange: Serializable // trajectories visited as: for (traj = start; traj < end; traj += step)
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
+                                        unsigned int, start,
+                                        unsigned int, end,
+                                        unsigned int, step);
+    };
+    
+    class GlobalPar: Serializable // content of the "global" section of the parameter file
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
+                                        TrajRange, trajCounter,
+                                        unsigned int, nt,
+                                        std::string, diskVectorDir,
+                                        std::string, output);
+    };
+
+    class A2AMatrixPar: Serializable // one input matrix: file (its "traj" token is substituted), dataset, cache size, name
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
+                                        std::string, file,
+                                        std::string, dataset,
+                                        unsigned int, cacheSize,
+                                        std::string, name);
+    };
+
+    class ProductPar: Serializable // one contraction: matrix names, time ranges between them, translations
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
+                                        std::string, terms,
+                                        std::vector<std::string>, times,
+                                        std::string, translations,
+                                        bool, translationAverage);
+    };
+
+    class CorrelatorResult: Serializable // record written by saveCorrelator(): inputs used plus the correlator values
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
+                                        std::vector<Contractor::A2AMatrixPar>,  a2aMatrix,
+                                        ProductPar, contraction,
+                                        std::vector<unsigned int>, times,
+                                        std::vector<ComplexD>, correlator);
+    };
+}
+
+struct ContractorPar // whole parameter file, filled section by section in main()
+{
+    Contractor::GlobalPar                  global; // read from "global"
+    std::vector<Contractor::A2AMatrixPar>  a2aMatrix; // read from "a2aMatrix"
+    std::vector<Contractor::ProductPar>    product; // read from "product"
+};
+
+// Recursively enumerate every combination made of one element of each set
+void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq,
+                 const std::vector<std::set<unsigned int>> &times,
+                 std::vector<unsigned int> &current,
+                 const unsigned int depth)
+{
+    const auto slot = times.size() - depth; // slot of 'current' set at this level
+    if (depth == 0)
+    {
+        timeSeq.push_back(current); // all slots are set: store the combination
+        return;
+    }
+    for (const auto value: times[slot])
+    {
+        current[slot] = value;
+        makeTimeSeq(timeSeq, times, current, depth - 1);
+    }
+}
+
+// Entry point: append to timeSeq the full cartesian product of the sets
+void makeTimeSeq(std::vector<std::vector<unsigned int>> &timeSeq,
+                 const std::vector<std::set<unsigned int>> &times)
+{
+    std::vector<unsigned int> scratch(times.size());
+    makeTimeSeq(timeSeq, times, scratch, times.size());
+}
+
+void saveCorrelator(const Contractor::CorrelatorResult &result, const std::string dir, 
+                    const unsigned int dt, const unsigned int traj)
+{   // writes 'result' to a file in 'dir' whose name is built from the terms, the times, dt and traj
+    std::string              fileStem = "", filename;
+    std::vector<std::string> terms = strToVec<std::string>(result.contraction.terms);
+
+    for (unsigned int i = 0; i < terms.size() - 1; i++) // NOTE(review): unsigned size() - 1 wraps around if terms is empty
+    {
+        fileStem += terms[i] + "_" + std::to_string(result.times[i]) + "_"; // "<term>_<time>_" for every term but the last
+    }
+    fileStem += terms.back();
+    if (!result.contraction.translationAverage)
+    {
+        fileStem += "_dt_" + std::to_string(dt); // dt only appears in the name of non-averaged results
+    }
+    filename = dir + "/" + RESULT_FILE_NAME(fileStem, traj);
+    std::cout << "Saving correlator to '" << filename << "'" << std::endl;
+    makeFileDir(dir); // NOTE(review): receives dir, not filename -- confirm makeFileDir creates 'dir' itself
+    ResultWriter writer(filename);
+    write(writer, fileStem, result);
+}
+
+// Build the set of times described by 'str'. The string is split by
+// strToVec; each token is a single time "t" or an inclusive range "a..b".
+// Throws Parsing on a malformed token and Range on a time >= nt.
+std::set<unsigned int> parseTimeRange(const std::string str, const unsigned int nt)
+{
+    std::regex               rex("([0-9]+)|(([0-9]+)\\.\\.([0-9]+))");
+    std::smatch              sm;
+    std::vector<std::string> rstr = strToVec<std::string>(str);
+    std::set<unsigned int>   tSet;
+
+    for (auto &s: rstr)
+    {
+        // reject tokens that are neither a single time nor an "a..b" range
+        if (!std::regex_match(s, sm, rex))
+        {
+            HADRONS_ERROR(Parsing, "cannot parse time '" + s + "' (from expression '" + str + "')");
+        }
+        if (sm[1].matched)
+        {
+            unsigned int t = std::stoi(sm[1].str());
+            if (t >= nt)
+            {
+                HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
+            }
+            tSet.insert(t);
+        }
+        else if (sm[2].matched)
+        {
+            unsigned int ta = std::stoi(sm[3].str()), tb = std::stoi(sm[4].str());
+            if ((ta >= nt) or (tb >= nt))
+            {
+                HADRONS_ERROR(Range, "time out of range (from expression '" + str + "')");
+            }
+            for (unsigned int ti = ta; ti <= tb; ++ti) // adds nothing when ta > tb
+            {
+                tSet.insert(ti);
+            }
+        }
+    }
+    return tSet;
+}
+
+// Duration given in microseconds, stored and printed in seconds
+struct Sec
+{
+    Sec(const double usec)
+    {
+        seconds = usec/1.0e6; // microseconds -> seconds
+    }
+    double seconds;
+};
+// Print as "<seconds> sec", the value right-aligned in a 10-character field.
+// A const lvalue reference accepts both temporaries and named objects.
+inline std::ostream & operator<< (std::ostream& s, const Sec &sec)
+{
+    s << std::setw(10) << sec.seconds << " sec";
+    return s;
+}
+
+// Floating-point rate: 'flops' operations done in 'fusec' microseconds
+struct Flops
+{
+    Flops(const double flops, const double fusec)
+    {
+        gFlopsPerSec = flops/fusec/1.0e3; // flop/usec = MFlop/s, then -> GFlop/s
+    }
+    double gFlopsPerSec;
+};
+// Print as "<rate> GFlop/s", the value right-aligned in a 10-character field.
+// A const lvalue reference accepts both temporaries and named objects.
+inline std::ostream & operator<< (std::ostream& s, const Flops &f)
+{
+    s << std::setw(10) << f.gFlopsPerSec << " GFlop/s";
+    return s;
+}
+
+// Data rate: 'bytes' bytes moved in 'busec' microseconds
+struct Bytes
+{
+    Bytes(const double bytes, const double busec)
+    {
+        gBytesPerSec = bytes/busec*1.0e6/1024/1024/1024; // byte/usec -> byte/s -> 2^30 byte/s
+    }
+    double gBytesPerSec;
+};
+// Print as "<rate> GB/s", the value right-aligned in a 10-character field.
+// A const lvalue reference accepts both temporaries and named objects.
+inline std::ostream & operator<< (std::ostream& s, const Bytes &b)
+{
+    s << std::setw(10) << b.gBytesPerSec << " GB/s";
+    return s;
+}
+
+// Contractor entry point: reads the XML parameter file given as only argument,
+// then for each trajectory loads the A2A matrices and computes every product.
+int main(int argc, char* argv[])
+{
+    // parse command line
+    std::string   parFilename;
+
+    if (argc != 2)
+    {
+        std::cerr << "usage: " << argv[0] << " <parameter file>" << std::endl;
+        return EXIT_FAILURE;
+    }
+    parFilename = argv[1];
+
+    // parse parameter file
+    ContractorPar par;
+    XmlReader     reader(parFilename);
+
+    read(reader, "global",    par.global);
+    read(reader, "a2aMatrix", par.a2aMatrix);
+    read(reader, "product",   par.product);
+    if ((par.global.nt == 0) or (par.global.trajCounter.step == 0))
+    {   // nt == 0 breaks TIME_MOD, step == 0 makes the trajectory loop endless
+        HADRONS_ERROR(Range, "'nt' and the trajectory 'step' must be non-zero");
+    }
+
+    // create diskvectors
+    std::map<std::string, EigenDiskVector<ComplexD>> a2aMat;
+
+    for (auto &p: par.a2aMatrix)
+    {
+        std::string dirName = par.global.diskVectorDir + "/" + p.name;
+
+        a2aMat.emplace(p.name, EigenDiskVector<ComplexD>(dirName, par.global.nt, p.cacheSize));
+    }
+
+    // trajectory loop
+    for (unsigned int traj = par.global.trajCounter.start; 
+         traj < par.global.trajCounter.end; traj += par.global.trajCounter.step)
+    {
+        std::cout << ":::::::: Trajectory " << traj << std::endl;
+
+        // load data
+        for (auto &p: par.a2aMatrix)
+        {
+            std::string filename = p.file;
+            double      t; // load time reported by a2aIo.load(), in microseconds
+
+            tokenReplace(filename, "traj", traj);
+            std::cout << "======== Loading '" << filename << "'" << std::endl;
+
+            A2AMatrixIo<HADRONS_A2AM_IO_TYPE> a2aIo(filename, p.dataset, par.global.nt);
+
+            a2aIo.load(a2aMat.at(p.name), &t);
+            std::cout << "Read " << a2aIo.getSize() << " bytes in " << t/1.0e6 
+                    << " sec, " << a2aIo.getSize()/t*1.0e6/1024/1024 << " MB/s" << std::endl;
+        }
+
+        // contract
+        // (each product entry is processed independently, with its own
+        // timers, buffers and result record)
+        for (auto &p: par.product)
+        {
+            std::vector<std::string>               term = strToVec<std::string>(p.terms);
+            std::vector<std::set<unsigned int>>    times;
+            std::vector<std::vector<unsigned int>> timeSeq;
+            std::set<unsigned int>                 translations;
+            std::vector<A2AMatrixTr<ComplexD>>     lastTerm(par.global.nt);
+            A2AMatrix<ComplexD>                    prod, tmp;
+            TimerArray                             tAr;
+            double                                 fusec, busec, flops, bytes;
+            Contractor::CorrelatorResult           result;
+
+            tAr.startTimer("Total");
+            std::cout << "======== Contraction tr(";
+            for (unsigned int g = 0; g < term.size(); ++g)
+            {
+                std::cout << term[g] << ((g == term.size() - 1) ? ')' : '*');
+            }
+            std::cout << std::endl;
+            if ((term.size() < 2) or (term.size() != p.times.size() + 1))
+            {   // t[0] is read below, so a product needs at least one time set
+                HADRONS_ERROR(Size, "number of terms (" + std::to_string(term.size()) 
+                            + ") must be at least 2 and one more than the number of times ("
+                            + std::to_string(p.times.size()) + ")");
+            }
+            for (auto &s: p.times)
+            {
+                times.push_back(parseTimeRange(s, par.global.nt));
+            }
+            for (auto &m: par.a2aMatrix)
+            {
+                if (std::find(result.a2aMatrix.begin(), result.a2aMatrix.end(), m) == result.a2aMatrix.end())
+                {
+                    result.a2aMatrix.push_back(m);
+                    tokenReplace(result.a2aMatrix.back().file, "traj", traj);
+                }
+            }
+            result.contraction = p;
+            result.correlator.resize(par.global.nt, 0.);
+
+            translations = parseTimeRange(p.translations, par.global.nt);
+            makeTimeSeq(timeSeq, times);
+            std::cout << timeSeq.size()*translations.size()*(term.size() - 2) << " A*B, "
+                    << timeSeq.size()*translations.size()*par.global.nt << " tr(A*B)"
+                    << std::endl;
+
+            std::cout << "* Caching transposed last term" << std::endl;
+            for (unsigned int t = 0; t < par.global.nt; ++t)
+            {
+                tAr.startTimer("Disk vector overhead");
+                const A2AMatrix<ComplexD> &ref = a2aMat.at(term.back())[t];
+                tAr.stopTimer("Disk vector overhead");
+
+                tAr.startTimer("Transpose caching");
+                lastTerm[t].resize(ref.rows(), ref.cols());
+                parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
+                for (unsigned int i = 0; i < ref.rows(); ++i)
+                {
+                    lastTerm[t](i, j) = ref(i, j);
+                }
+                tAr.stopTimer("Transpose caching");
+            }
+            bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
+            std::cout << Sec(tAr.getDTimer("Transpose caching")) << " " 
+                      << Bytes(bytes, tAr.getDTimer("Transpose caching")) << std::endl;
+            for (unsigned int i = 0; i < timeSeq.size(); ++i)
+            {
+                unsigned int dti = 0;
+                auto         &t = timeSeq[i];
+
+                result.times = t;
+                for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                {
+                    result.correlator[tLast] = 0.;
+                }
+                for (auto &dt: translations)
+                {
+                    std::cout << "* Step " << i*translations.size() + dti + 1
+                            << "/" << timeSeq.size()*translations.size()
+                            << " -- positions= " << t << ", dt= " << dt << std::endl;
+                    if (term.size() > 2)
+                    {
+                        std::cout << std::setw(8) << "products";
+                    }
+                    flops  = 0.;
+                    bytes  = 0.;
+                    fusec  = tAr.getDTimer("A*B algebra"); // readings before this step, subtracted below
+                    busec  = tAr.getDTimer("A*B total");
+                    tAr.startTimer("Linear algebra");
+                    tAr.startTimer("Disk vector overhead");
+                    prod = a2aMat.at(term[0])[TIME_MOD(t[0] + dt)];
+                    tAr.stopTimer("Disk vector overhead");
+                    for (unsigned int j = 1; j < term.size() - 1; ++j)
+                    {
+                        tAr.startTimer("Disk vector overhead");
+                        const A2AMatrix<ComplexD> &ref = a2aMat.at(term[j])[TIME_MOD(t[j] + dt)];
+                        tAr.stopTimer("Disk vector overhead");
+
+                        tAr.startTimer("A*B total");
+                        tAr.startTimer("A*B algebra");
+                        A2AContraction::mul(tmp, prod, ref);
+                        tAr.stopTimer("A*B algebra");
+                        flops += A2AContraction::mulFlops(prod, ref);
+                        prod   = tmp;
+                        tAr.stopTimer("A*B total");
+                        bytes += 3.*tmp.rows()*tmp.cols()*sizeof(ComplexD);
+                    }
+                    if (term.size() > 2)
+                    {
+                        std::cout << Sec(tAr.getDTimer("A*B total") - busec) << " "
+                                << Flops(flops, tAr.getDTimer("A*B algebra") - fusec) << " " 
+                                << Bytes(bytes, tAr.getDTimer("A*B total") - busec) << std::endl;
+                    }
+                    std::cout << std::setw(8) << "traces";
+                    flops  = 0.;
+                    bytes  = 0.;
+                    fusec  = tAr.getDTimer("tr(A*B)");
+                    busec  = tAr.getDTimer("tr(A*B)");
+                    for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                    {
+                        tAr.startTimer("tr(A*B)");
+                        A2AContraction::accTrMul(result.correlator[TIME_MOD(tLast - dt)], prod, lastTerm[tLast]);
+                        tAr.stopTimer("tr(A*B)");
+                        flops += A2AContraction::accTrMulFlops(prod, lastTerm[tLast]);
+                        bytes += 2.*prod.rows()*prod.cols()*sizeof(ComplexD);
+                    }
+                    tAr.stopTimer("Linear algebra");
+                    std::cout << Sec(tAr.getDTimer("tr(A*B)") - busec) << " "
+                            << Flops(flops, tAr.getDTimer("tr(A*B)") - fusec) << " " 
+                            << Bytes(bytes, tAr.getDTimer("tr(A*B)") - busec) << std::endl;
+                    if (!p.translationAverage)
+                    {
+                        // one file per translation: save, then restart from zero
+                        saveCorrelator(result, par.global.output, dt, traj);
+                        for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                        {
+                            result.correlator[tLast] = 0.;
+                        }
+                    }
+                    dti++;
+                }
+                if (p.translationAverage)
+                {
+                    for (unsigned int tLast = 0; tLast < par.global.nt; ++tLast)
+                    {
+                        result.correlator[tLast] /= translations.size();
+                    }
+                    saveCorrelator(result, par.global.output, 0, traj);
+                }
+            }
+            tAr.stopTimer("Total");
+            printTimeProfile(tAr.getTimings(), tAr.getTimer("Total"));
+        }
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/Hadrons/Utilities/Contractor.hpp b/Hadrons/Utilities/Contractor.hpp
new file mode 100644
index 00000000..9640c7c8
--- /dev/null
+++ b/Hadrons/Utilities/Contractor.hpp
@@ -0,0 +1,12 @@
+#ifndef  Hadrons_Contractor_hpp_
+#define Hadrons_Contractor_hpp_
+
+#include <Hadrons/Global.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_Contractor_hpp_
diff --git a/Hadrons/Utilities/ContractorBenchmark.cc b/Hadrons/Utilities/ContractorBenchmark.cc
new file mode 100644
index 00000000..1ec4657b
--- /dev/null
+++ b/Hadrons/Utilities/ContractorBenchmark.cc
@@ -0,0 +1,434 @@
+#include <Hadrons/Global.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+#ifdef USE_MKL
+#include "mkl.h"
+#include "mkl_cblas.h"
+#endif
+
+using namespace Grid;
+using namespace Hadrons;
+
+#ifdef GRID_COMMS_MPI3 // MPI build: the macros wrap calls on MPI_COMM_WORLD
+#define GET_RANK(rank, nMpi) \
+MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
+MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
+#define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
+#define INIT() MPI_Init(NULL, NULL)
+#define FINALIZE() MPI_Finalize()
+#else // otherwise: a single rank 0, and the other macros expand to nothing
+#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
+#define BARRIER()
+#define INIT()
+#define FINALIZE()
+#endif // note: GET_RANK expands to two statements, so it must not be the unbraced body of an if/for
+
+// Time fn(buf, left[i], right[i]) over this rank's share of the matrices
+template <typename Function, typename MatLeft, typename MatRight>
+inline void trBenchmark(const std::string name, const MatLeft &left,
+                        const MatRight &right, const ComplexD ref, Function fn)
+{
+    double       t, flops, bytes, n = left[0].rows()*left[0].cols();
+    unsigned int nMat = left.size();
+    int          nMpi, rank;
+    ComplexD     buf;
+
+    GET_RANK(rank, nMpi);
+    BARRIER(); // synchronise first so that waiting for other ranks is not timed
+    t = -usecond();
+    for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
+    {
+        fn(buf, left[i], right[i]);
+    }
+    BARRIER(); // the clock stops once every rank is done
+    t += usecond();
+    flops = nMat*(6.*n + 2.*(n - 1.)); // counted over all nMat matrices, not only this rank's
+    bytes = nMat*(2.*n*sizeof(ComplexD));
+
+    if (rank == 0)
+    {
+        std::cout << std::setw(34) << name << ": diff= "
+                  << std::setw(12) << std::norm(buf-ref)
+                  << std::setw(10) << t/1.0e6 << " sec "
+                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
+                  << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
+                  << std::endl;
+    }
+    ::sleep(1);
+}
+
+// Time fn(buf, left[i], right[i]) over this rank's share of the matrices
+template <typename Function, typename MatV, typename Mat>
+inline void mulBenchmark(const std::string name, const MatV &left,
+                         const MatV &right, const Mat &ref, Function fn)
+{
+    double       t, flops, bytes;
+    double       nr = left[0].rows(), nc = left[0].cols();
+    unsigned int nMat = left.size();
+    int          nMpi, rank;
+    Mat          buf(left[0].rows(), left[0].rows());
+
+    GET_RANK(rank, nMpi);
+    BARRIER(); // synchronise first so that waiting for other ranks is not timed
+    t = -usecond();
+    for (unsigned int i = rank*nMat/nMpi; i < (rank+1)*nMat/nMpi; ++i)
+    {
+        fn(buf, left[i], right[i]);
+    }
+    BARRIER(); // the clock stops once every rank is done
+    t += usecond();
+    flops = nMat*(nr*nr*(6.*nc + 2.*(nc - 1.))); // counted over all nMat matrices
+    bytes = nMat*(2*nc*nr*sizeof(ComplexD));
+
+    if (rank == 0)
+    {
+        std::cout << std::setw(34) << name << ": diff= "
+                  << std::setw(12) << (buf-ref).squaredNorm()
+                  << std::setw(10) << t/1.0e6 << " sec "
+                  << std::setw(10) << flops/t/1.0e3 << " GFlop/s " 
+                  << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
+                  << std::endl;
+    }
+    ::sleep(1);
+}
+
+#ifdef USE_MKL
+template <typename MatLeft, typename MatRight>
+static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
+                            const MatLeft &a, const MatRight &b)
+{   // res = row aRow of a times column aRow of b, through cblas_zdotu_sub
+    const ComplexD *aPt, *bPt;
+    unsigned int   aInc, bInc;
+
+    if (MatLeft::Options == Eigen::RowMajor) // NOTE(review): equality test; option flags combined with other bits match neither branch and leave aPt/aInc unset
+    {
+        aPt  = a.data() + aRow*a.cols(); // row-major: the row is contiguous
+        aInc = 1;
+    }
+    else if (MatLeft::Options == Eigen::ColMajor)
+    {
+        aPt  = a.data() + aRow; // col-major: row elements are a.rows() apart
+        aInc = a.rows();
+    }
+    if (MatRight::Options == Eigen::RowMajor)
+    {
+        bPt  = b.data() + aRow; // row-major: column elements are b.cols() apart
+        bInc = b.cols();
+    }
+    else if (MatRight::Options == Eigen::ColMajor)
+    {
+        bPt  = b.data() + aRow*b.rows(); // col-major: the column is contiguous
+        bInc = 1;
+    }
+    cblas_zdotu_sub(a.cols(), aPt, aInc, bPt, bInc, &res); // a.cols() products, result stored in res
+}
+
+template <typename MatLeft, typename MatRight>
+static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
+                            const MatLeft &a, const MatRight &b)
+{   // res = column aCol of a times row aCol of b, through cblas_zdotu_sub
+    const ComplexD *aPt, *bPt;
+    unsigned int   aInc, bInc;
+
+    if (MatLeft::Options == Eigen::RowMajor) // NOTE(review): equality test; option flags combined with other bits match neither branch and leave aPt/aInc unset
+    {
+        aPt  = a.data() + aCol; // row-major: column elements are a.cols() apart
+        aInc = a.cols();
+    }
+    else if (MatLeft::Options == Eigen::ColMajor)
+    {
+        aPt  = a.data() + aCol*a.rows(); // col-major: the column is contiguous
+        aInc = 1;
+    }
+    if (MatRight::Options == Eigen::RowMajor)
+    {
+        bPt  = b.data() + aCol*b.cols(); // row-major: the row is contiguous
+        bInc = 1;
+    }
+    else if (MatRight::Options == Eigen::ColMajor)
+    {
+        bPt  = b.data() + aCol; // col-major: row elements are b.rows() apart
+        bInc = b.rows();
+    }
+    cblas_zdotu_sub(a.rows(), aPt, aInc, bPt, bInc, &res); // a.rows() products, result stored in res
+}
+#endif
+
+// Benchmark every available tr(A*B) implementation on nMat copies of a random
+// ni x nj matrix A (type MatLeft) and of a random nj x ni matrix B (MatRight).
+template <typename MatLeft, typename MatRight>
+void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
+{
+    std::vector<MatLeft>  left;
+    std::vector<MatRight> right;
+    ComplexD              ref;
+    int                   rank, nMpi;
+
+    // left.back()/right.back() below are undefined on empty vectors
+    if (nMat == 0)
+    {
+        return;
+    }
+    left.resize(nMat, MatLeft::Random(ni, nj));
+    right.resize(nMat, MatRight::Random(nj, ni));
+    GET_RANK(rank, nMpi);
+    if (rank == 0)
+    {
+        std::cout << "==== tr(A*B) benchmarks" << std::endl;
+        std::cout << "A matrices use " << (MatLeft::IsRowMajor ? "row-major" : "col-major")
+                  << " ordering" << std::endl;
+        std::cout << "B matrices use " << (MatRight::IsRowMajor ? "row-major" : "col-major")
+                  << " ordering" << std::endl;
+        std::cout << std::endl;
+    }
+    BARRIER();
+    // reference trace, computed with Eigen on the last couple of matrices
+    ref = (left.back()*right.back()).trace();
+    // Hadrons implementation (presumably accumulates into res, hence the reset)
+    trBenchmark("Hadrons A2AContraction::accTrMul", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        A2AContraction::accTrMul(res, a, b);
+    });
+    // explicit double loop, parallel_for over the rows of A
+    trBenchmark("Naive loop rows first", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        auto nr = a.rows(), nc = a.cols();
+
+        res = 0.;
+        parallel_for (unsigned int i = 0; i < nr; ++i)
+        {
+            ComplexD tmp = 0.;
+
+            for (unsigned int j = 0; j < nc; ++j)
+            {
+                tmp += a(i, j)*b(j, i);
+            }
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    // explicit double loop, parallel_for over the columns of A
+    trBenchmark("Naive loop cols first", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        auto nr = a.rows(), nc = a.cols();
+
+        res = 0.;
+        parallel_for (unsigned int j = 0; j < nc; ++j)
+        {
+            ComplexD tmp = 0.;
+
+            for (unsigned int i = 0; i < nr; ++i)
+            {
+                tmp += a(i, j)*b(j, i);
+            }
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    // trace of the full Eigen matrix product
+    trBenchmark("Eigen tr(A*B)", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = (a*b).trace();
+    });
+    // Eigen dot() conjugates its left operand, conjugate() cancels that
+    trBenchmark("Eigen row-wise dot", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        {
+            ComplexD tmp;
+
+            tmp = a.row(r).conjugate().dot(b.col(r));
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    // same as above with the columns of A against the rows of B
+    trBenchmark("Eigen col-wise dot", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        {
+            ComplexD tmp;
+
+            tmp = a.col(c).conjugate().dot(b.row(c));
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    // sum of the element-wise product of A with the transpose of B
+    trBenchmark("Eigen Hadamard", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = a.cwiseProduct(b.transpose()).sum();
+    });
+#ifdef USE_MKL
+    trBenchmark("MKL row-wise zdotu", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        {
+            ComplexD tmp;
+
+            zdotuRow(tmp, r, a, b);
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+    trBenchmark("MKL col-wise zdotu", left, right, ref,
+    [](ComplexD &res, const MatLeft &a, const MatRight &b)
+    {
+        res = 0.;
+        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        {
+            ComplexD tmp;
+
+            zdotuCol(tmp, c, a, b);
+            parallel_critical
+            {
+                res += tmp;
+            }
+        }
+    });
+#endif
+    BARRIER();
+    if (rank == 0)
+    {
+        std::cout << std::endl;
+    }
+}
+
+// Benchmark every available A*B implementation on nMat copies of a random
+// ni x nj matrix A and of a random nj x ni matrix B, both of type Mat.
+template <typename Mat>
+void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned int nMat)
+{
+    std::vector<Mat> left, right;
+    Mat              ref;
+    int              rank, nMpi;
+
+    // left.back()/right.back() below are undefined on empty vectors
+    if (nMat == 0)
+    {
+        return;
+    }
+    left.resize(nMat, Mat::Random(ni, nj));
+    right.resize(nMat, Mat::Random(nj, ni));
+    GET_RANK(rank, nMpi);
+    if (rank == 0)
+    {
+        std::cout << "==== A*B benchmarks" << std::endl;
+        std::cout << "all matrices use " << (Mat::IsRowMajor ? "row-major" : "col-major")
+                  << " ordering" << std::endl;
+        std::cout << std::endl;
+    }
+    BARRIER();
+    ref = left.back()*right.back();
+    mulBenchmark("Hadrons A2AContraction::mul", left, right, ref,
+    [](Mat &res, const Mat &a, const Mat &b)
+    {
+        A2AContraction::mul(res, a, b);
+    });
+    mulBenchmark("Eigen A*B", left, right, ref,
+    [](Mat &res, const Mat &a, const Mat &b)
+    {
+        res = a*b;
+    });
+#ifdef USE_MKL
+    mulBenchmark("MKL A*B", left, right, ref,
+    [](Mat &res, const Mat &a, const Mat &b)
+    {
+        const ComplexD one(1., 0.), zero(0., 0.);
+        if (Mat::IsRowMajor)
+        {
+            cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
+                        res.data(), res.cols());
+        }
+        else
+        {
+            cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
+                        a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
+                        res.data(), res.rows());
+        }
+    });
+#endif
+    BARRIER();
+    if (rank == 0)
+    {
+        std::cout << std::endl;
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    // parse and validate the command line: <Ni> <Nj> <#matrices>
+    Eigen::Index ni, nj, nMat;
+    int          nMpi, rank;
+
+    if (argc != 4)
+    {
+        std::cerr << "usage: " << argv[0] << " <Ni> <Nj> <#matrices>" << std::endl;
+        return EXIT_FAILURE;
+    }
+    ni   = std::stoi(argv[1]);
+    nj   = std::stoi(argv[2]);
+    nMat = std::stoi(argv[3]);
+    // sizes are passed on as unsigned int and must be non-empty: reject values <= 0
+    if ((ni <= 0) || (nj <= 0) || (nMat <= 0))
+    {
+        std::cerr << "error: <Ni>, <Nj> and <#matrices> must be positive" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    INIT();
+    GET_RANK(rank, nMpi);
+    if (rank == 0)
+    {
+        std::cout << "\n*** ALL-TO-ALL MATRIX CONTRACTION BENCHMARK ***\n" << std::endl;
+        std::cout << nMat << " couples of " << ni << "x" << nj << " matrices\n" << std::endl;
+        std::cout << nMpi << " MPI processes" << std::endl;
+#ifdef GRID_OMP
+        #pragma omp parallel
+        {
+            #pragma omp single
+            std::cout << omp_get_num_threads() << " threads\n" << std::endl;
+        }
+#else
+        std::cout << "Single-threaded\n" << std::endl;
+#endif
+#ifdef EIGEN_USE_MKL_ALL
+        std::cout << "Eigen uses the MKL" << std::endl;
+#endif
+        std::cout << "Eigen uses " << Eigen::nbThreads() << " threads" << std::endl;
+#ifdef USE_MKL
+        std::cout << "MKL   uses " << mkl_get_max_threads() << " threads" << std::endl;
+#endif
+        std::cout << std::endl;
+    }
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrix<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullTrBenchmark<A2AMatrixTr<ComplexD>, A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrix<ComplexD>>(ni, nj, nMat);
+    fullMulBenchmark<A2AMatrixTr<ComplexD>>(ni, nj, nMat);
+    FINALIZE();
+    return EXIT_SUCCESS;
+}
diff --git a/Hadrons/Utilities/EigenPackCast.cc b/Hadrons/Utilities/EigenPackCast.cc
new file mode 100644
index 00000000..8c3a2b3f
--- /dev/null
+++ b/Hadrons/Utilities/EigenPackCast.cc
@@ -0,0 +1,217 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Hadrons/Utilities/EigenPackCast.cc
+
+Copyright (C) 2015-2018
+
+Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Hadrons/EigenPack.hpp>
+#include <Hadrons/Environment.hpp>
+
+using namespace Grid;
+using namespace QCD;
+using namespace Hadrons;
+
+template <typename FOut, typename FIn>
+void convert(const std::string outFilename, const std::string inFilename, 
+             const unsigned int Ls, const bool rb, const unsigned int size, 
+             const bool multiFile, const bool testRead)
+{
+    assert(outFilename != inFilename); // never write over the input (inactive with NDEBUG)
+    // Reads `size` vectors of type FIn from inFilename and writes them as FOut to outFilename.
+    typedef EigenPack<FOut>            EPOut;
+    typedef EigenPack<FIn>             EPIn;
+    typedef typename FOut::vector_type VTypeOut;
+    typedef typename FIn::vector_type  VTypeIn;
+    // the shared_ptr own the grids built below, gIn/gOut only point to the selected ones
+    std::shared_ptr<GridCartesian>         gInBase, gOutBase, gIn5, gOut5;
+    std::shared_ptr<GridRedBlackCartesian> rbgIn, rbgOut;
+    GridBase                               *gIn, *gOut;
+
+    auto         dim     = GridDefaultLatt();
+    unsigned int nd      = dim.size();
+    auto         simdOut = GridDefaultSimd(nd, VTypeOut::Nsimd());
+    auto         simdIn  = GridDefaultSimd(nd, VTypeIn::Nsimd());
+    // one base grid per field type, each with its own SIMD layout
+    gOutBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdOut, GridDefaultMpi()));
+    gInBase.reset(SpaceTimeGrid::makeFourDimGrid(dim, simdIn, GridDefaultMpi()));
+    if (rb) // red-black grids, five-dimensional only when Ls > 1
+    {
+        if (Ls > 1)
+        {
+            rbgOut.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gOutBase.get()));
+            rbgIn.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, gInBase.get()));
+        }
+        else
+        {
+            rbgOut.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gOutBase.get()));
+            rbgIn.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(gInBase.get()));
+        }
+        gOut = rbgOut.get();
+        gIn  = rbgIn.get();
+    }
+    else // full grids: five-dimensional when Ls > 1, the base grids otherwise
+    {
+        if (Ls > 1)
+        {
+            gOut5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gOutBase.get()));
+            gIn5.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, gInBase.get()));
+            gOut = gOut5.get();
+            gIn  = gIn5.get();
+        }
+        else
+        {
+            gOut = gOutBase.get();
+            gIn  = gInBase.get();
+        }
+    }
+
+    FOut         bufOut(gOut);
+    FIn          bufIn(gIn), testIn(gIn);
+    ScidacWriter binWriter(gOut->IsBoss());
+    ScidacReader binReader;
+    PackRecord   record;
+    RealD        eval;
+
+    LOG(Message) << "==== EIGENPACK CONVERSION" << std::endl;
+    LOG(Message) << "Lattice       : " << gIn->GlobalDimensions() << std::endl;
+    LOG(Message) << "Checkerboarded: " << (rb ? "yes" : "no") << std::endl;
+    LOG(Message) << "In path       : " << inFilename  << std::endl;
+    LOG(Message) << "In type       : " << typeName<FIn>() << std::endl;
+    LOG(Message) << "Out path      : " << outFilename << std::endl;
+    LOG(Message) << "Out type      : " << typeName<FOut>() << std::endl;
+    LOG(Message) << "#vectors      : " << size << std::endl;
+    LOG(Message) << "Multifile     : " << (multiFile ? "yes" : "no") << std::endl;
+    LOG(Message) << "Test read     : " << (testRead ? "yes" : "no") << std::endl;
+    if (multiFile) // one file "<path>/v<k>.bin" per vector, each with its own header
+    {
+        for(unsigned int k = 0; k < size; ++k)
+        {
+            std::string  outV = outFilename + "/v" + std::to_string(k) + ".bin";
+            std::string  inV  = inFilename + "/v" + std::to_string(k) + ".bin";
+
+            LOG(Message) << "==== Converting vector " << k << std::endl;
+            LOG(Message) << "In : " << inV  << std::endl;
+            LOG(Message) << "Out: " << outV << std::endl;
+            // conversion: copy the header, then read vector k as FIn and write it as FOut
+            LOG(Message) << "-- Doing conversion" << std::endl;
+            makeFileDir(outV, gOut);
+            binWriter.open(outV);
+            binReader.open(inV);
+            EigenPackIo::readHeader(record, binReader);
+            EigenPackIo::writeHeader(binWriter, record);
+            EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
+            EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn); // bufOut/testIn: presumably cast and check buffers - TODO confirm
+            binWriter.close();
+            binReader.close();
+            // read test; NOTE(review): no readHeader here, unlike the single-file test below - confirm
+            if (testRead)
+            {
+                LOG(Message) << "-- Test read" << std::endl;
+                binReader.open(outV);
+                EigenPackIo::readElement<FOut>(bufOut, eval, k, binReader);
+                binReader.close();
+            }
+        }
+    }
+    else // single file: one header followed by the `size` vectors
+    {
+        // conversion: copy the header once, then convert the vectors one by one
+        LOG(Message) << "-- Doing conversion" << std::endl;
+        makeFileDir(outFilename, gOut);
+        binWriter.open(outFilename);
+        binReader.open(inFilename);
+        EigenPackIo::readHeader(record, binReader);
+        EigenPackIo::writeHeader(binWriter, record);
+        for(unsigned int k = 0; k < size; ++k)
+        {
+            EigenPackIo::readElement<FIn>(bufIn, eval, k, binReader);
+            EigenPackIo::writeElement<FIn, FOut>(binWriter, bufIn, eval, k, &bufOut, &testIn);
+        }
+        binWriter.close();
+        binReader.close();
+        // read test: the output is read back, nothing is compared in this function
+        if (testRead)
+        {
+            LOG(Message) << "-- Test read" << std::endl;
+            binReader.open(outFilename);
+            EigenPackIo::readHeader(record, binReader);
+            for(unsigned int k = 0; k < size; ++k)
+            {
+                EigenPackIo::readElement<FOut>(bufOut, eval, k, binReader);
+            }
+            binReader.close();
+        }
+    }
+}
+
+#ifndef FOUT // output field type of convert(); Hadrons/Utilities/Makefile.am passes it with -DFOUT
+#warning "FOUT undefined (set to WilsonImplF::FermionField by default)"
+#define FOUT WilsonImplF::FermionField
+#endif
+#ifndef FIN // input field type of convert(); Hadrons/Utilities/Makefile.am passes it with -DFIN
+#warning "FIN undefined (set to WilsonImplD::FermionField by default)"
+#define FIN WilsonImplD::FermionField
+#endif
+
+int main(int argc, char *argv[])
+{
+    // parse command line; sizes are read as int so negative values can be rejected
+    std::string outFilename, inFilename;
+    int         size, Ls;
+    bool        rb, multiFile, testRead;
+
+    if (argc < 8)
+    {
+        std::cerr << "usage: " << argv[0] << " <out eigenpack> <in eigenpack> <Ls> <red-black {0|1}> <#vector> <multifile {0|1}> <test read {0|1}> [Grid options]" << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    outFilename = argv[1];
+    inFilename  = argv[2];
+    Ls          = std::stoi(std::string(argv[3]));
+    rb          = (std::string(argv[4]) != "0");
+    size        = std::stoi(std::string(argv[5]));
+    multiFile   = (std::string(argv[6]) != "0");
+    testRead    = (std::string(argv[7]) != "0");
+    if ((Ls < 0) || (size < 0)) // would wrap to huge unsigned values in convert()
+    {
+        std::cerr << "error: <Ls> and <#vector> must be non-negative" << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    // initialization
+    Grid_init(&argc, &argv);
+    initLogger();
+    // execution
+    try
+    {
+        convert<FOUT, FIN>(outFilename, inFilename, Ls, rb, size, multiFile, testRead);
+    }
+    catch (const std::exception& e)
+    {
+        Exceptions::abort(e);
+    }
+    // epilogue
+    LOG(Message) << "Grid is finalizing now" << std::endl;
+    Grid_finalize();
+    return EXIT_SUCCESS;
+}
diff --git a/extras/Hadrons/HadronsXmlRun.cc b/Hadrons/Utilities/HadronsXmlRun.cc
similarity index 81%
rename from extras/Hadrons/HadronsXmlRun.cc
rename to Hadrons/Utilities/HadronsXmlRun.cc
index ab858844..a78c8724 100644
--- a/extras/Hadrons/HadronsXmlRun.cc
+++ b/Hadrons/Utilities/HadronsXmlRun.cc
@@ -2,10 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/HadronsXmlRun.cc
+Source file: Hadrons/Utilities/HadronsXmlRun.cc
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -27,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Hadrons/Application.hpp>
+#include <Hadrons/Application.hpp>
 
 using namespace Grid;
  
@@ -55,14 +54,10 @@ int main(int argc, char *argv[])
     
     // initialization
     Grid_init(&argc, &argv);
-    HadronsLogError.Active(GridLogError.isActive());
-    HadronsLogWarning.Active(GridLogWarning.isActive());
-    HadronsLogMessage.Active(GridLogMessage.isActive());
-    HadronsLogIterative.Active(GridLogIterative.isActive());
-    HadronsLogDebug.Active(GridLogDebug.isActive());
-    LOG(Message) << "Grid initialized" << std::endl;
     
     // execution
+    try
+    {
     Application application(parameterFileName);
     
     application.parseParameterFile(parameterFileName);
@@ -71,6 +66,11 @@ int main(int argc, char *argv[])
         application.loadSchedule(scheduleFileName);
     }
     application.run();
+    }
+    catch (const std::exception& e)
+    {
+        Exceptions::abort(e);
+    }
     
     // epilogue
     LOG(Message) << "Grid is finalizing now" << std::endl;
diff --git a/Hadrons/Utilities/Makefile.am b/Hadrons/Utilities/Makefile.am
new file mode 100644
index 00000000..4f324d6d
--- /dev/null
+++ b/Hadrons/Utilities/Makefile.am
@@ -0,0 +1,14 @@
+# Hadrons command-line utilities, all linked against the same two archives.
+bin_PROGRAMS = HadronsXmlRun HadronsFermionEP64To32 HadronsContractor HadronsContractorBenchmark
+
+# default link line, used by every program that has no _LDADD of its own
+LDADD = ../libHadrons.a ../../Grid/libGrid.a
+
+HadronsXmlRun_SOURCES = HadronsXmlRun.cc
+
+HadronsFermionEP64To32_SOURCES  = EigenPackCast.cc
+HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
+
+HadronsContractor_SOURCES = Contractor.cc
+
+HadronsContractorBenchmark_SOURCES = ContractorBenchmark.cc
diff --git a/extras/Hadrons/VirtualMachine.cc b/Hadrons/VirtualMachine.cc
similarity index 76%
rename from extras/Hadrons/VirtualMachine.cc
rename to Hadrons/VirtualMachine.cc
index 87645eed..09a2694b 100644
--- a/extras/Hadrons/VirtualMachine.cc
+++ b/Hadrons/VirtualMachine.cc
@@ -2,9 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/VirtualMachine.cc
+Source file: Hadrons/VirtualMachine.cc
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -26,9 +26,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 
-#include <Grid/Hadrons/VirtualMachine.hpp>
-#include <Grid/Hadrons/GeneticScheduler.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
+#include <Hadrons/VirtualMachine.hpp>
+#include <Hadrons/GeneticScheduler.hpp>
+#include <Hadrons/ModuleFactory.hpp>
 
 using namespace Grid;
  
@@ -48,6 +48,17 @@ unsigned int VirtualMachine::getTrajectory(void) const
     return traj_;
 }
 
+// run tag /////////////////////////////////////////////////////////////////////
+void VirtualMachine::setRunId(const std::string id)
+{
+    runId_ = id; // stored unchanged, returned by getRunId()
+}
+
+std::string VirtualMachine::getRunId(void) const
+{
+    return runId_; // copy of the tag last stored by setRunId()
+}
+
 // module management ///////////////////////////////////////////////////////////
 void VirtualMachine::pushModule(VirtualMachine::ModPt &pt)
 {
@@ -111,6 +122,7 @@ void VirtualMachine::pushModule(VirtualMachine::ModPt &pt)
             {
                 // output does not exists, add it
                 env().addObject(out, address);
+                module_[address].output.push_back(env().getObjectAddress(out));
             }
             else
             {
@@ -122,10 +134,11 @@ void VirtualMachine::pushModule(VirtualMachine::ModPt &pt)
                 else
                 {
                     // output already fully registered, error
-                    HADRON_ERROR(Definition, "object '" + out
+                    HADRONS_ERROR_REF(ObjectDefinition, "object '" + out
                                  + "' is already produced by module '"
                                  + module_[env().getObjectModule(out)].name
-                                 + "' (while pushing module '" + name + "')");
+                                 + "' (while pushing module '" + name + "')",
+                                 env().getObjectAddress(out));
                 }
                 if (getModule(address)->getReference().size() > 0)
                 {
@@ -157,7 +170,7 @@ void VirtualMachine::pushModule(VirtualMachine::ModPt &pt)
     }
     else
     {
-        HADRON_ERROR(Definition, "module '" + name + "' already exists");
+        HADRONS_ERROR(Definition, "module '" + name + "' already exists");
     }
 }
 
@@ -184,7 +197,7 @@ ModuleBase * VirtualMachine::getModule(const unsigned int address) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no module with address " + std::to_string(address));
+        HADRONS_ERROR(Definition, "no module with address " + std::to_string(address));
     }
 }
 
@@ -201,7 +214,7 @@ unsigned int VirtualMachine::getModuleAddress(const std::string name) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no module with name '" + name + "'");
+        HADRONS_ERROR(Definition, "no module with name '" + name + "'");
     }
 }
 
@@ -213,7 +226,7 @@ std::string VirtualMachine::getModuleName(const unsigned int address) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no module with address " + std::to_string(address));
+        HADRONS_ERROR(Definition, "no module with address " + std::to_string(address));
     }
 }
 
@@ -225,7 +238,7 @@ std::string VirtualMachine::getModuleType(const unsigned int address) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no module with address " + std::to_string(address));
+        HADRONS_ERROR(Definition, "no module with address " + std::to_string(address));
     }
 }
 
@@ -249,6 +262,11 @@ std::string VirtualMachine::getModuleNamespace(const std::string name) const
     return getModuleNamespace(getModuleAddress(name));
 }
 
+int VirtualMachine::getCurrentModule(void) const
+{
+    return currentModule_; // address of the module being set up or executed, reset to -1 afterwards
+}
+
 bool VirtualMachine::hasModule(const unsigned int address) const
 {
     return (address < module_.size());
@@ -296,12 +314,66 @@ void VirtualMachine::makeModuleGraph(void)
     {
         for (auto &in: module_[m].input)
         {
-            graph.addEdge(env().getObjectModule(in), m);
+            int min = env().getObjectModule(in);
+
+            if (min < 0)
+            {
+                HADRONS_ERROR_REF(ObjectDefinition, "dependency '" 
+                             + env().getObjectName(in) + "' (address " 
+                             + std::to_string(in)
+                             + ") is not produced by any module", in);
+            }
+            else
+            {
+                graph.addEdge(min, m);
+            }
         }
     }
     graph_ = graph;
 }
 
+// dump GraphViz graph /////////////////////////////////////////////////////////
+// Write the module dependency graph to out in GraphViz DOT format: one edge
+// per module input, going from the module producing the object to the module
+// reading it and labelled with the object name, then one record node per
+// module showing its registered name and its name.
+void VirtualMachine::dumpModuleGraph(std::ostream &out)
+{
+    makeModuleGraph();
+    out << "digraph hadrons {" << std::endl;
+    out << "node [shape=record, fontname=\"Courier\", fontsize=\"11\"];" << std::endl;
+    out << "graph [fontname = \"Courier\", fontsize=\"11\"];" << std::endl;
+    out << "edge [fontname = \"Courier\", fontsize=\"11\"];"<< std::endl;
+    for (unsigned int m = 0; m < module_.size(); ++m)
+    {
+        for (auto &in: module_[m].input)
+        {
+            int min = env().getObjectModule(in);
+
+            out << min << " -> " << m << " [ label = \""
+                << env().getObjectName(in) << "\" ];" << std::endl;
+        }
+    }
+    for (unsigned int m = 0; m < module_.size(); ++m)
+    {
+        out <<  m << " [ label = \"{<f0> " << getModule(m)->getRegisteredName()
+            << " |<f1> " << getModuleName(m) << "}\" ];" << std::endl;
+    }
+    out << "}\n" << std::endl;
+}
+
+void VirtualMachine::dumpModuleGraph(void)
+{
+    dumpModuleGraph(std::cout); // same GraphViz output, sent to the standard output
+}
+
+void VirtualMachine::dumpModuleGraph(const std::string filename)
+{
+    std::ofstream f(filename);
+    // NOTE(review): the stream state is not checked, a failed open goes unnoticed
+    dumpModuleGraph(f);
+}
+
 // memory profile //////////////////////////////////////////////////////////////
 const VirtualMachine::MemoryProfile & VirtualMachine::getMemoryProfile(void)
 {
@@ -327,7 +399,6 @@ void VirtualMachine::makeMemoryProfile(void)
     env().protectObjects(false);
     GridLogMessage.Active(false);
     HadronsLogMessage.Active(false);
-    HadronsLogError.Active(false);
     for (auto it = program.rbegin(); it != program.rend(); ++it) 
     {
         auto a = *it;
@@ -335,7 +406,7 @@ void VirtualMachine::makeMemoryProfile(void)
         if (profile_.module[a].empty())
         {
             LOG(Debug) << "Profiling memory for module '" << module_[a].name
-                       << "' (" << a << ")..." << std::endl;
+                       << "' (" << a << ")" << std::endl;
             memoryProfile(a);
             env().freeAll();
         }
@@ -343,7 +414,6 @@ void VirtualMachine::makeMemoryProfile(void)
     env().protectObjects(protect);
     GridLogMessage.Active(gmsg);
     HadronsLogMessage.Active(hmsg);
-    HadronsLogError.Active(err);
     LOG(Debug) << "Memory profile:" << std::endl;
     LOG(Debug) << "----------------" << std::endl;
     for (unsigned int a = 0; a < profile_.module.size(); ++a)
@@ -413,25 +483,25 @@ void VirtualMachine::memoryProfile(const unsigned int address)
     auto m = getModule(address);
 
     LOG(Debug) << "Setting up module '" << m->getName() 
-               << "' (" << address << ")..." << std::endl;
+               << "' (" << address << ")" << std::endl;
     try
     {
+        currentModule_ = address;
         m->setup();
+        currentModule_ = -1;
         updateProfile(address);
     }
-    catch (Exceptions::Definition &)
+    catch (Exceptions::ObjectDefinition &exc)
     {
         cleanEnvironment();
-        for (auto &in: m->getInput())
+        if (!env().hasCreatedObject(exc.getAddress()))
         {
-            memoryProfile(env().getObjectModule(in));
+            LOG(Debug) << "Object '" << env().getObjectName(exc.getAddress())
+                       << "' missing for setup of '" << m->getName() 
+                       << "' (" << address << ")" << std::endl;
+            memoryProfile(env().getObjectModule(exc.getAddress()));
         }
-        for (auto &ref: m->getReference())
-        {
-            memoryProfile(env().getObjectModule(ref));
-        }
-        m->setup();
-        updateProfile(address);
+        memoryProfile(address);
     }
 }
 
@@ -530,9 +600,12 @@ VirtualMachine::Program VirtualMachine::schedule(const GeneticPar &par)
     };
     Scheduler scheduler(graph, memPeak, gpar);
     gen = 0;
+    scheduler.initPopulation();
+    LOG(Iterative) << "Start: " << sizeString(scheduler.getMinValue()) 
+                   << std::endl;
     do
     {
-        LOG(Debug) << "Generation " << gen << ":" << std::endl;
+        //LOG(Debug) << "Generation " << gen << ":" << std::endl;
         scheduler.nextGeneration();
         if (gen != 0)
         {
@@ -560,11 +633,12 @@ VirtualMachine::Program VirtualMachine::schedule(const GeneticPar &par)
 }
 
 // general execution ///////////////////////////////////////////////////////////
-#define BIG_SEP "==============="
-#define SEP     "---------------"
+#define BIG_SEP   "================"
+#define SEP       "----------------"
+#define SMALL_SEP "................"
 #define MEM_MSG(size) sizeString(size)
 
-void VirtualMachine::executeProgram(const Program &p) const
+void VirtualMachine::executeProgram(const Program &p)
 {
     Size            memPeak = 0, sizeBefore, sizeAfter;
     GarbageSchedule freeProg;
@@ -572,18 +646,54 @@ void VirtualMachine::executeProgram(const Program &p) const
     // build garbage collection schedule
     LOG(Debug) << "Building garbage collection schedule..." << std::endl;
     freeProg = makeGarbageSchedule(p);
+    for (unsigned int i = 0; i < freeProg.size(); ++i)
+    {
+        std::string msg = "";
+
+        for (auto &a: freeProg[i])
+        {
+            msg += env().getObjectName(a) + " ";
+        }
+        msg += "]";
+        LOG(Debug) << std::setw(4) << i + 1 << ": [" << msg << std::endl;
+    }
 
     // program execution
     LOG(Debug) << "Executing program..." << std::endl;
+    totalTime_ = GridTime::zero();
     for (unsigned int i = 0; i < p.size(); ++i)
     {
         // execute module
         LOG(Message) << SEP << " Measurement step " << i + 1 << "/"
                      << p.size() << " (module '" << module_[p[i]].name
                      << "') " << SEP << std::endl;
+        LOG(Message) << SMALL_SEP << " Module execution" << std::endl;
+        currentModule_ = p[i];
         (*module_[p[i]].data)();
+        currentModule_ = -1;
         sizeBefore = env().getTotalSize();
+        // print time profile after execution
+        LOG(Message) << SMALL_SEP << " Timings" << std::endl;
+
+        std::map<std::string, GridTime> ctiming, gtiming;
+        GridTime                        total;
+
+        ctiming  = module_[p[i]].data->getTimings();
+        total    = ctiming.at("_total");
+        gtiming["total"]     = ctiming["_total"];   ctiming.erase("_total");
+        gtiming["setup"]     = ctiming["_setup"];   ctiming.erase("_setup");
+        gtiming["execution"] = ctiming["_execute"]; ctiming.erase("_execute");
+        LOG(Message) << "* GLOBAL TIMERS" << std::endl;
+        printTimeProfile(gtiming, total);
+        if (!ctiming.empty())
+        {
+            LOG(Message) << "* CUSTOM TIMERS" << std::endl;
+            printTimeProfile(ctiming, total);
+        }
+        timeProfile_[module_[p[i]].name] = total;
+        totalTime_ += total;
         // print used memory after execution
+        LOG(Message) << SMALL_SEP << " Memory management" << std::endl;
         LOG(Message) << "Allocated objects: " << MEM_MSG(sizeBefore)
                      << std::endl;
         if (sizeBefore > memPeak)
@@ -608,9 +718,14 @@ void VirtualMachine::executeProgram(const Program &p) const
             LOG(Message) << "Nothing to free" << std::endl;
         }
     }
+    // print total time profile
+     LOG(Message) << SEP << " Measurement time profile" << SEP << std::endl;
+     LOG(Message) << "Total measurement time: " << totalTime_ << " us" << std::endl;
+     LOG(Message) << SMALL_SEP << " Module breakdown" << std::endl;
+     printTimeProfile(timeProfile_, totalTime_);
 }
 
-void VirtualMachine::executeProgram(const std::vector<std::string> &p) const
+void VirtualMachine::executeProgram(const std::vector<std::string> &p)
 {
     Program pAddress;
     
diff --git a/extras/Hadrons/VirtualMachine.hpp b/Hadrons/VirtualMachine.hpp
similarity index 87%
rename from extras/Hadrons/VirtualMachine.hpp
rename to Hadrons/VirtualMachine.hpp
index 3af7d914..9d0357f5 100644
--- a/extras/Hadrons/VirtualMachine.hpp
+++ b/Hadrons/VirtualMachine.hpp
@@ -2,9 +2,9 @@
 
 Grid physics library, www.github.com/paboyle/Grid 
 
-Source file: extras/Hadrons/VirtualMachine.hpp
+Source file: Hadrons/VirtualMachine.hpp
 
-Copyright (C) 2017
+Copyright (C) 2015-2018
 
 Author: Antonin Portelli <antonin.portelli@me.com>
 
@@ -29,9 +29,9 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #ifndef Hadrons_VirtualMachine_hpp_
 #define Hadrons_VirtualMachine_hpp_
 
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Graph.hpp>
-#include <Grid/Hadrons/Environment.hpp>
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Graph.hpp>
+#include <Hadrons/Environment.hpp>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -84,13 +84,16 @@ private:
         const std::type_info      *type{nullptr};
         std::string               name;
         ModPt                     data{nullptr};
-        std::vector<unsigned int> input;
+        std::vector<unsigned int> input, output;
         size_t                    maxAllocated;
     };
 public:
     // trajectory counter
     void                setTrajectory(const unsigned int traj);
     unsigned int        getTrajectory(void) const;
+    // run tag
+    void                setRunId(const std::string id);
+    std::string         getRunId(void) const;
     // module management
     void                pushModule(ModPt &pt);
     template <typename M>
@@ -114,12 +117,17 @@ public:
     std::string         getModuleType(const std::string name) const;
     std::string         getModuleNamespace(const unsigned int address) const;
     std::string         getModuleNamespace(const std::string name) const;
+    int                 getCurrentModule(void) const;
     bool                hasModule(const unsigned int address) const;
     bool                hasModule(const std::string name) const;
     // print VM content
     void                printContent(void) const;
     // module graph (could be a const reference if topoSort was const)
     Graph<unsigned int> getModuleGraph(void);
+    // dump GraphViz graph
+    void                dumpModuleGraph(std::ostream &out);
+    void                dumpModuleGraph(void);
+    void                dumpModuleGraph(const std::string filename);
     // memory profile
     const MemoryProfile &getMemoryProfile(void);
     // garbage collector
@@ -129,8 +137,8 @@ public:
     // genetic scheduler
     Program             schedule(const GeneticPar &par);
     // general execution
-    void                executeProgram(const Program &p) const;
-    void                executeProgram(const std::vector<std::string> &p) const;
+    void                executeProgram(const Program &p);
+    void                executeProgram(const std::vector<std::string> &p);
 private:
     // environment shortcut
     DEFINE_ENV_ALIAS;
@@ -146,17 +154,21 @@ private:
     void memoryProfile(const unsigned int address);
 private:
     // general
+    std::string                         runId_;
     unsigned int                        traj_;
     // module and related maps
     std::vector<ModuleInfo>             module_;
     std::map<std::string, unsigned int> moduleAddress_;
-    std::string                         currentModule_{""};
+    int                                 currentModule_{-1};
     // module graph
     bool                                graphOutdated_{true};
     Graph<unsigned int>                 graph_;
     // memory profile
     bool                                memoryProfileOutdated_{true};
-    MemoryProfile                       profile_;
+    MemoryProfile                       profile_;     
+    // time profile
+    GridTime                            totalTime_;
+    std::map<std::string, GridTime>     timeProfile_;               
 };
 
 /******************************************************************************
@@ -190,7 +202,7 @@ M * VirtualMachine::getModule(const unsigned int address) const
     }
     else
     {
-        HADRON_ERROR(Definition, "module '" + module_[address].name
+        HADRONS_ERROR(Definition, "module '" + module_[address].name
                      + "' does not have type " + typeid(M).name()
                      + "(has type: " + getModuleType(address) + ")");
     }
diff --git a/Hadrons/add_module.sh b/Hadrons/add_module.sh
new file mode 100755
index 00000000..e37d19b1
--- /dev/null
+++ b/Hadrons/add_module.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Create Modules/<namespace>/<module name>.{cc,hpp} from the Module_in_NS templates.
+if (( $# != 2 )); then
+    echo "usage: $(basename "$0") <module name> <namespace>" 1>&2
+    exit 1
+fi
+NAME=$1
+NS=$2
+TEMPLATE="Modules/templates/Module_in_NS"
+OUT="Modules/${NS}/${NAME}"
+mkdir -p "Modules/${NS}"
+if [ -e "${OUT}.cc" ] || [ -e "${OUT}.hpp" ]; then
+    echo "error: files Modules/${NS}/${NAME}.* already exists" 1>&2
+    exit 1
+fi
+for EXT in cc hpp; do
+    # One sed pass fills both placeholders; on failure drop the partial files and abort.
+    sed -e "s/___FILEBASENAME___/${NAME}/g" -e "s/___NAMESPACE___/${NS}/g" \
+        "${TEMPLATE}.${EXT}.template" > "${OUT}.${EXT}" \
+        || { rm -f "${OUT}.cc" "${OUT}.hpp"; exit 1; }
+done
+./make_module_list.sh
diff --git a/Hadrons/add_module_template.sh b/Hadrons/add_module_template.sh
new file mode 100755
index 00000000..8ba112dd
--- /dev/null
+++ b/Hadrons/add_module_template.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Create Modules/<namespace>/<module name>.{cc,hpp} from the Module_tmp_in_NS templates.
+if (( $# != 2 )); then
+    echo "usage: $(basename "$0") <module name> <namespace>" 1>&2
+    exit 1
+fi
+NAME=$1
+NS=$2
+TEMPLATE="Modules/templates/Module_tmp_in_NS"
+OUT="Modules/${NS}/${NAME}"
+mkdir -p "Modules/${NS}"
+if [ -e "${OUT}.cc" ] || [ -e "${OUT}.hpp" ]; then
+    echo "error: files Modules/${NS}/${NAME}.* already exists" 1>&2
+    exit 1
+fi
+for EXT in cc hpp; do
+    # One sed pass fills both placeholders; on failure drop the partial files and abort.
+    sed -e "s/___FILEBASENAME___/${NAME}/g" -e "s/___NAMESPACE___/${NS}/g" \
+        "${TEMPLATE}.${EXT}.template" > "${OUT}.${EXT}" \
+        || { rm -f "${OUT}.cc" "${OUT}.hpp"; exit 1; }
+done
+./make_module_list.sh
diff --git a/Hadrons/make_module_inst.sh b/Hadrons/make_module_inst.sh
new file mode 100755
index 00000000..32ead3f0
--- /dev/null
+++ b/Hadrons/make_module_inst.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# For each module header with MODULE_REGISTER_TMP lines and no .cc file, generate that .cc file.
+for m in `find Modules -name '*.hpp' -type f -print`; do
+    echo "====== ${m}"
+    CCFILE=`echo $m | sed -E 's/\.hpp$/.cc/'`
+    NS=`echo $m | awk -F '/' '{print $2}'`
+    NMOD=`grep -E 'MODULE_REGISTER_TMP.+<.+>.?' $m | wc -l`
+    if [ ! -e ${CCFILE} ] && (( NMOD != 0 )); then
+        echo "#include <Hadrons/${m}>" >> ${CCFILE}
+        echo "" >> ${CCFILE}
+        echo "using namespace Grid;" >> ${CCFILE}
+        echo "using namespace Hadrons;" >> ${CCFILE}
+        echo "using namespace ${NS};" >> ${CCFILE}
+        echo "" >> ${CCFILE}
+        for i in `grep -E 'MODULE_REGISTER_TMP.+<.+>.?' $m | sed -E 's/ +//g'`
+        do
+            TMPARG=`echo ${i} | grep -oE 'ARG\(.+>\)' | sed -E 's/^ARG\(//g' | sed -E 's/\)$//g'`
+            SUB=`echo ${i} | sed -E 's/ARG\(.+>\)/@arg@/g' | sed -E 's/,/|/g'`
+            SUB=`echo ${SUB} | sed -E 's/.+\(//g' | sed -E 's/\);//g'`
+            SUB=`echo ${SUB} | sed -E "s/@arg@/${TMPARG}/g"`
+            # SUB fields are '|'-separated; the second one is the type to instantiate.
+            TYPE=`echo ${SUB} | awk -F '|' '{print $2}'`
+            echo "template class Grid::Hadrons::${NS}::${TYPE};" >> ${CCFILE}
+        done
+        echo "" >> ${CCFILE}
+    fi
+done
\ No newline at end of file
diff --git a/Hadrons/make_module_list.sh b/Hadrons/make_module_list.sh
new file mode 100755
index 00000000..750f4046
--- /dev/null
+++ b/Hadrons/make_module_list.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Rebuild modules.inc (modules_cc / modules_hpp lists) and Modules.hpp from Modules/.
+for EXT in cc hpp; do
+    printf 'modules_%s =\\\n' "${EXT}"
+    # Indent every path; all but the last one get a trailing line continuation.
+    find Modules -name "*.${EXT}" -type f -print | sed 's/^/  /;$q;s/$/ \\/'
+    echo ''
+done > modules.inc
+rm -f Modules.hpp
+for HEADER in $(find Modules -name '*.hpp'); do
+    echo "#include <Hadrons/${HEADER}>" >> Modules.hpp
+done
diff --git a/Hadrons/modules.inc b/Hadrons/modules.inc
new file mode 100644
index 00000000..88cfea88
--- /dev/null
+++ b/Hadrons/modules.inc
@@ -0,0 +1,148 @@
+modules_cc =\
+  Modules/MContraction/WeakHamiltonianEye.cc \
+  Modules/MContraction/Baryon.cc \
+  Modules/MContraction/Meson.cc \
+  Modules/MContraction/WeakNeutral4ptDisc.cc \
+  Modules/MContraction/WeakHamiltonianNonEye.cc \
+  Modules/MContraction/A2AAslashField.cc \
+  Modules/MContraction/WardIdentity.cc \
+  Modules/MContraction/A2AMesonField.cc \
+  Modules/MContraction/DiscLoop.cc \
+  Modules/MContraction/Gamma3pt.cc \
+  Modules/MFermion/FreeProp.cc \
+  Modules/MFermion/GaugeProp.cc \
+  Modules/MSource/Momentum.cc \
+  Modules/MSource/Point.cc \
+  Modules/MSource/Wall.cc \
+  Modules/MSource/SeqConserved.cc \
+  Modules/MSource/SeqGamma.cc \
+  Modules/MSource/Z2.cc \
+  Modules/MSink/Point.cc \
+  Modules/MSink/Smear.cc \
+  Modules/MSolver/A2AVectors.cc \
+  Modules/MSolver/A2AAslashVectors.cc \
+  Modules/MSolver/RBPrecCG.cc \
+  Modules/MSolver/MixedPrecisionRBPrecCG.cc \
+  Modules/MSolver/LocalCoherenceLanczos.cc \
+  Modules/MGauge/StoutSmearing.cc \
+  Modules/MGauge/Unit.cc \
+  Modules/MGauge/Electrify.cc \
+  Modules/MGauge/UnitEm.cc \
+  Modules/MGauge/StochEm.cc \
+  Modules/MGauge/Random.cc \
+  Modules/MGauge/FundtoHirep.cc \
+  Modules/MGauge/GaugeFix.cc \
+  Modules/MNoise/TimeDilutedSpinColorDiagonal.cc \
+  Modules/MNoise/FullVolumeSpinColorDiagonal.cc \
+  Modules/MUtilities/RandomVectors.cc \
+  Modules/MUtilities/TestSeqGamma.cc \
+  Modules/MUtilities/PrecisionCast.cc \
+  Modules/MUtilities/TestSeqConserved.cc \
+  Modules/MLoop/NoiseLoop.cc \
+  Modules/MScalar/FreeProp.cc \
+  Modules/MScalar/VPCounterTerms.cc \
+  Modules/MScalar/ChargedProp.cc \
+  Modules/MScalar/ScalarVP.cc \
+  Modules/MNPR/Amputate.cc \
+  Modules/MNPR/Bilinear.cc \
+  Modules/MNPR/FourQuark.cc \
+  Modules/MAction/Wilson.cc \
+  Modules/MAction/MobiusDWF.cc \
+  Modules/MAction/ZMobiusDWF.cc \
+  Modules/MAction/WilsonClover.cc \
+  Modules/MAction/DWF.cc \
+  Modules/MAction/ScaledDWF.cc \
+  Modules/MScalarSUN/TrPhi.cc \
+  Modules/MScalarSUN/Grad.cc \
+  Modules/MScalarSUN/TrMag.cc \
+  Modules/MScalarSUN/TrKinetic.cc \
+  Modules/MScalarSUN/EMT.cc \
+  Modules/MScalarSUN/ShiftProbe.cc \
+  Modules/MScalarSUN/TransProj.cc \
+  Modules/MScalarSUN/StochFreeField.cc \
+  Modules/MScalarSUN/TwoPoint.cc \
+  Modules/MScalarSUN/TwoPointNPR.cc \
+  Modules/MScalarSUN/Div.cc \
+  Modules/MIO/LoadEigenPack.cc \
+  Modules/MIO/LoadBinary.cc \
+  Modules/MIO/LoadNersc.cc \
+  Modules/MIO/LoadCoarseEigenPack.cc \
+  Modules/MIO/LoadCosmHol.cc \
+  Modules/MIO/LoadA2AVectors.cc
+
+modules_hpp =\
+  Modules/MContraction/Baryon.hpp \
+  Modules/MContraction/A2AAslashField.hpp \
+  Modules/MContraction/A2AMesonField.hpp \
+  Modules/MContraction/Meson.hpp \
+  Modules/MContraction/WeakHamiltonian.hpp \
+  Modules/MContraction/WeakHamiltonianNonEye.hpp \
+  Modules/MContraction/DiscLoop.hpp \
+  Modules/MContraction/WeakNeutral4ptDisc.hpp \
+  Modules/MContraction/Gamma3pt.hpp \
+  Modules/MContraction/WardIdentity.hpp \
+  Modules/MContraction/WeakHamiltonianEye.hpp \
+  Modules/MFermion/FreeProp.hpp \
+  Modules/MFermion/GaugeProp.hpp \
+  Modules/MSource/SeqGamma.hpp \
+  Modules/MSource/Point.hpp \
+  Modules/MSource/Wall.hpp \
+  Modules/MSource/Z2.hpp \
+  Modules/MSource/SeqConserved.hpp \
+  Modules/MSource/Momentum.hpp \
+  Modules/MSink/Smear.hpp \
+  Modules/MSink/Point.hpp \
+  Modules/MSolver/MixedPrecisionRBPrecCG.hpp \
+  Modules/MSolver/LocalCoherenceLanczos.hpp \
+  Modules/MSolver/Guesser.hpp \
+  Modules/MSolver/RBPrecCG.hpp \
+  Modules/MSolver/A2AVectors.hpp \
+  Modules/MSolver/A2AAslashVectors.hpp \
+  Modules/MGauge/UnitEm.hpp \
+  Modules/MGauge/StoutSmearing.hpp \
+  Modules/MGauge/Unit.hpp \
+  Modules/MGauge/Electrify.hpp \
+  Modules/MGauge/Random.hpp \
+  Modules/MGauge/GaugeFix.hpp \
+  Modules/MGauge/FundtoHirep.hpp \
+  Modules/MGauge/StochEm.hpp \
+  Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp \
+  Modules/MNoise/FullVolumeSpinColorDiagonal.hpp \
+  Modules/MUtilities/PrecisionCast.hpp \
+  Modules/MUtilities/RandomVectors.hpp \
+  Modules/MUtilities/TestSeqGamma.hpp \
+  Modules/MUtilities/TestSeqConserved.hpp \
+  Modules/MLoop/NoiseLoop.hpp \
+  Modules/MScalar/FreeProp.hpp \
+  Modules/MScalar/VPCounterTerms.hpp \
+  Modules/MScalar/ScalarVP.hpp \
+  Modules/MScalar/Scalar.hpp \
+  Modules/MScalar/ChargedProp.hpp \
+  Modules/MNPR/Bilinear.hpp \
+  Modules/MNPR/Amputate.hpp \
+  Modules/MNPR/FourQuark.hpp \
+  Modules/MAction/DWF.hpp \
+  Modules/MAction/MobiusDWF.hpp \
+  Modules/MAction/Wilson.hpp \
+  Modules/MAction/WilsonClover.hpp \
+  Modules/MAction/ZMobiusDWF.hpp \
+  Modules/MAction/ScaledDWF.hpp \
+  Modules/MScalarSUN/StochFreeField.hpp \
+  Modules/MScalarSUN/TwoPointNPR.hpp \
+  Modules/MScalarSUN/ShiftProbe.hpp \
+  Modules/MScalarSUN/Div.hpp \
+  Modules/MScalarSUN/TrMag.hpp \
+  Modules/MScalarSUN/EMT.hpp \
+  Modules/MScalarSUN/TwoPoint.hpp \
+  Modules/MScalarSUN/TrPhi.hpp \
+  Modules/MScalarSUN/Utils.hpp \
+  Modules/MScalarSUN/TransProj.hpp \
+  Modules/MScalarSUN/Grad.hpp \
+  Modules/MScalarSUN/TrKinetic.hpp \
+  Modules/MIO/LoadEigenPack.hpp \
+  Modules/MIO/LoadNersc.hpp \
+  Modules/MIO/LoadA2AVectors.hpp \
+  Modules/MIO/LoadCosmHol.hpp \
+  Modules/MIO/LoadCoarseEigenPack.hpp \
+  Modules/MIO/LoadBinary.hpp
+
diff --git a/Makefile.am b/Makefile.am
index 3a65cf1b..d905b8e8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,17 +1,16 @@
 # additional include paths necessary to compile the C++ library
-SUBDIRS = lib benchmarks tests extras
+SUBDIRS = Grid Hadrons benchmarks tests
 
 include $(top_srcdir)/doxygen.inc
 
 bin_SCRIPTS=grid-config
 
-
 .PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
 
 tests-local: all
 bench-local: all
 check-local: all
 
-AM_CXXFLAGS += -I$(top_builddir)/include
+AM_CXXFLAGS += -I$(top_builddir)
 
 ACLOCAL_AMFLAGS = -I m4
diff --git a/README.md b/README.md
index 13dd6996..86506f52 100644
--- a/README.md
+++ b/README.md
@@ -187,10 +187,11 @@ Alternatively, some CPU codenames can be directly used:
 | `<code>`    | Description                            |
 | ----------- | -------------------------------------- |
 | `KNL`       | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
+| `SKL`       | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) |
 | `BGQ`       | Blue Gene/Q                            |
 
 #### Notes:
-- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
+- We currently support AVX512 for the Intel compiler and GCC (KNL and SKL targets). Support for clang will appear in a future version of Grid, once AVX512 support in that compiler is more advanced.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performances are currently rather poor. This is being investigated for future versions.
 - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.
diff --git a/TODO b/TODO
index 746302ca..a7a34fdd 100644
--- a/TODO
+++ b/TODO
@@ -1,8 +1,69 @@
 TODO:
 ---------------
 
-Code item work list
 
+GPU branch code item work list
+-----------------------------
+
+- Audit NAMESPACE CHANGES
+- Audit HMC timestep / traj length size
+- Verify HMC one flavour ratio; suspect dH too big
+- pragma once uniformly
+- GPU offload reductions; thrust initial ; inclusive_scan vs reduce?
+- Audit changes
+- thread_loop interface revisit.
+
+- - Need (1) omp parallel for     <-- thread_loop
+- -      (2) omp for
+- -      (3) omp for collapse(n)
+- -      (4) omp parallel for collapse(n)
+- - Only (1) has a natural mirror in accelerator_loop
+- - Nested loop macros get cumbersome
+- - Don't like thread_region and thread_loop_in_region
+- - Could replace with 
+
+    thread_nested(1, 
+      for {
+
+      }
+    );
+    thread_nested(2,
+      for (){
+        for (){
+
+	}
+      }
+    );
+
+    and same "in_region".
+
+- Remove old parallel_for macros, fix errors
+- check accelerator_loop uniformly used in fermion operators
+- Gamma tables on GPU
+- Accelerate the cshift
+- Accelerate non-dslash elements of Mobius
+- Mobius kernel fusion.
+- Staggered kernels inline for GPU
+- Reread WilsonKernels and check diffs
+- Single GPU simd target (VGPU)
+- Lebesgue reorder in all kernels
+- merge2 where used. Audit routines, comment out and check compile.
+- Pragmas.h - prune and remove strong_inline (?)
+- 
+-----------------------------
+Physics item work list:
+
+2)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
+4)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+5)- HDCR resume
+
+-----------------------------
+Nov 2018
+
+1)- BG/Q port and check ; Andrew says ok.
+3)- Physical propagator interface  -- DONE
+
+DONE
 a) namespaces & indentation
  GRID_BEGIN_NAMESPACE();
  GRID_END_NAMESPACE();
@@ -16,14 +77,6 @@ b) GPU branch
 - Start port once Nvidia box is up
 - Cut down volume of code for first port? How?
 
-Physics item work list:
-
-1)- BG/Q port and check ; Andrew says ok.
-2)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-3)- Physical propagator interface
-4)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-5)- HDCR resume
-
 ----------------------------
 Recent DONE 
 -- RNG I/O in ILDG/SciDAC (minor) 
@@ -162,6 +215,8 @@ RECENT
 
 
 DONE:
+
+
 - MultiArray -- MultiRHS done
 - ConjugateGradientMultiShift -- DONE
 - MCR                         -- DONE
diff --git a/VERSION b/VERSION
index bfad377d..a0211af1 100644
--- a/VERSION
+++ b/VERSION
@@ -1,4 +1,4 @@
-Version : 0.7.0
+Version : 0.8.0
 
 - Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
 - MPI and MPI3 comms optimisations for KNL and OPA finished
diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc
new file mode 100644
index 00000000..3d3b0ce0
--- /dev/null
+++ b/benchmarks/Benchmark_IO.cc
@@ -0,0 +1,47 @@
+
+#include "Benchmark_IO.hpp"
+
+#ifndef BENCH_IO_LMAX
+#define BENCH_IO_LMAX 40
+#endif
+
+using namespace Grid;
+
+std::string filestem(const int l)
+{ // file stem used for the benchmark at size l, e.g. "iobench_l4" for l = 4
+  return std::string("iobench_l").append(std::to_string(l));
+}
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  // Lattice of extent l times the MPI layout in each of the four directions.
+  auto globalLatt = [](const int l)
+  {
+    auto mpi = GridDefaultMpi();
+    return std::vector<int>{l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
+  };
+  int64_t nThreads = GridThread::GetThreads();
+
+  MSG << "Grid is setup to use " << nThreads << " threads" << std::endl;
+  MSG << SEP << std::endl;
+  MSG << "Benchmark Lime write" << std::endl;
+  MSG << SEP << std::endl;
+  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
+  {
+    std::cout << "-- Local volume " << l << "^4" << std::endl;
+    writeBenchmark<LatticeFermion>(globalLatt(l), filestem(l), limeWrite<LatticeFermion>);
+  }
+
+  MSG << "Benchmark Lime read" << std::endl;
+  MSG << SEP << std::endl;
+  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
+  {
+    std::cout << "-- Local volume " << l << "^4" << std::endl;
+    readBenchmark<LatticeFermion>(globalLatt(l), filestem(l), limeRead<LatticeFermion>);
+  }
+  Grid_finalize();
+
+  return EXIT_SUCCESS;
+}
diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp
new file mode 100644
index 00000000..91fcb61f
--- /dev/null
+++ b/benchmarks/Benchmark_IO.hpp
@@ -0,0 +1,107 @@
+#ifndef Benchmark_IO_hpp_
+#define Benchmark_IO_hpp_
+
+#include <Grid/Grid.h>
+
+#define MSG std::cout << GridLogMessage
+#define SEP \
+"============================================================================="
+
+namespace Grid {
+
+template <typename Field>
+using WriterFn = std::function<void(const std::string, Field &)> ;
+template <typename Field>
+using ReaderFn = std::function<void(Field &, const std::string)>;
+
+template <typename Field>
+void limeWrite(const std::string filestem, Field &vec)
+{
+  // Writes vec as a single SciDAC field record to the file "<filestem>.bin".
+  emptyUserRecord userRecord;
+  ScidacWriter    writer(vec.Grid()->IsBoss());
+  writer.open(filestem + ".bin");
+  writer.writeScidacFieldRecord(vec, userRecord);
+  writer.close();
+}
+
+template <typename Field>
+void limeRead(Field &vec, const std::string filestem)
+{
+  // Reads a single SciDAC field record from the file "<filestem>.bin" into vec.
+  emptyUserRecord userRecord;
+  ScidacReader    reader;
+  reader.open(filestem + ".bin");
+  reader.readScidacFieldRecord(vec, userRecord);
+  reader.close();
+}
+
+inline void makeGrid(std::shared_ptr<GridBase> &gPt,
+                     const std::shared_ptr<GridCartesian> &gBasePt,
+                     const unsigned int Ls = 1, const bool rb = false)
+{
+  // Sets gPt to the grid selected by (Ls, rb), built on top of gBasePt:
+  // red-black when rb is set, five-dimensional when Ls > 1. With neither,
+  // gPt simply shares ownership of gBasePt.
+  GridCartesian *base  = gBasePt.get();
+  const bool     fiveD = (Ls > 1);
+
+  if (rb && fiveD)
+  {
+    gPt.reset(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, base));
+  }
+  else if (rb)
+  {
+    gPt.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(base));
+  }
+  else if (fiveD)
+  {
+    gPt.reset(SpaceTimeGrid::makeFiveDimGrid(Ls, base));
+  }
+  else
+  {
+    gPt = gBasePt;
+  }
+}
+
+template <typename Field>
+void writeBenchmark(const Coordinate &latt, const std::string filename,
+                    const WriterFn<Field> &write,
+                    const unsigned int Ls = 1, const bool rb = false)
+{
+  // Build the base grid for latt, then the (Ls, rb) variant chosen by makeGrid.
+  auto mpiLayout  = GridDefaultMpi();
+  auto simdLayout = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
+  std::shared_ptr<GridCartesian> base4d(SpaceTimeGrid::makeFourDimGrid(latt, simdLayout, mpiLayout));
+  std::shared_ptr<GridBase>      grid;
+  makeGrid(grid, base4d, Ls, rb);
+
+  // Fill a field with random numbers and pass it to the writer callback.
+  GridParallelRNG rng(grid.get());
+  Field           vec(grid.get());
+
+  random(rng, vec);
+  write(filename, vec);
+}
+
+template <typename Field>
+void readBenchmark(const Coordinate &latt, const std::string filename,
+                   const ReaderFn<Field> &read,
+                   const unsigned int Ls = 1, const bool rb = false)
+{
+  // Build the base grid for latt, then the (Ls, rb) variant chosen by makeGrid.
+  auto mpiLayout  = GridDefaultMpi();
+  auto simdLayout = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
+  std::shared_ptr<GridCartesian> base4d(SpaceTimeGrid::makeFourDimGrid(latt, simdLayout, mpiLayout));
+  std::shared_ptr<GridBase>      grid;
+  makeGrid(grid, base4d, Ls, rb);
+
+  // Allocate a field on that grid and let the reader callback fill it.
+  Field vec(grid.get());
+
+  read(vec, filename);
+}
+
+}
+
+#endif // Benchmark_IO_hpp_
diff --git a/benchmarks/Benchmark_IO_vs_dir.cc b/benchmarks/Benchmark_IO_vs_dir.cc
new file mode 100644
index 00000000..15f6194f
--- /dev/null
+++ b/benchmarks/Benchmark_IO_vs_dir.cc
@@ -0,0 +1,79 @@
+#include "Benchmark_IO.hpp"
+
+#define MSG std::cout << GridLogMessage
+#define SEP \
+"============================================================================="
+
+using namespace Grid;
+using namespace QCD;
+
+int main (int argc, char ** argv)
+{
+  std::vector<std::string> dir;
+  unsigned int             Ls;
+  bool                     rb;
+  if (argc < 4)
+  {
+    // The mandatory arguments are missing, so argv[1..3] must not be read: report and stop.
+    std::cerr << "usage: " << argv[0] << " <Ls> <RB {0|1}> <dir1> [<dir2> ... <dirn>] [Grid options]" << std::endl;
+    return EXIT_FAILURE;
+  }
+  Ls = std::stoi(argv[1]);
+  rb = (std::string(argv[2]) == "1");
+  for (int i = 3; i < argc; ++i)
+  {
+    std::string a = argv[i];
+    // The directory list ends at the first argument that starts with '-'.
+    if (a[0] != '-')
+    {
+      dir.push_back(a);
+    }
+    else
+    {
+      break;
+    }
+  }
+  Grid_init(&argc,&argv);
+
+  int64_t threads = GridThread::GetThreads();
+  MSG << "Grid is setup to use " << threads << " threads" << std::endl;
+  MSG << SEP << std::endl;
+  MSG << "Benchmark double precision Lime write" << std::endl;
+  MSG << SEP << std::endl;
+  for (auto &d: dir)
+  {
+    MSG << "-- Directory " << d << std::endl;
+    writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermion>, Ls, rb);
+  }
+
+  MSG << SEP << std::endl;
+  MSG << "Benchmark double precision Lime read" << std::endl;
+  MSG << SEP << std::endl;
+  for (auto &d: dir)
+  {
+    MSG << "-- Directory " << d << std::endl;
+    readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermion>, Ls, rb);
+  }
+
+  MSG << SEP << std::endl;
+  MSG << "Benchmark single precision Lime write" << std::endl;
+  MSG << SEP << std::endl;
+  for (auto &d: dir)
+  {
+    MSG << "-- Directory " << d << std::endl;
+    writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb);
+  }
+
+  MSG << SEP << std::endl;
+  MSG << "Benchmark single precision Lime read" << std::endl;
+  MSG << SEP << std::endl;
+  for (auto &d: dir)
+  {
+    MSG << "-- Directory " << d << std::endl;
+    readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb);
+  }
+
+  Grid_finalize();
+
+  return EXIT_SUCCESS;
+}
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 0f95b115..989d810d 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -446,7 +446,7 @@ int main (int argc, char ** argv)
   }    
 
 
-
+#ifdef GRID_OMP
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -502,9 +502,9 @@ int main (int argc, char ** argv)
 	      int comm_proc = mpi_layout[mu]-1;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    }
-
+            int tid = omp_get_thread_num();
 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
-					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
+					       (void *)&rbuf[dir][0], recv_from_rank, bytes,tid);
 
 	    thread_critical { dbytes+=tbytes; }
 	  }
@@ -531,7 +531,7 @@ int main (int argc, char ** argv)
  
     }
   }    
-
+#endif
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 6914aba9..f9ec8c88 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -48,7 +48,6 @@ int main (int argc, char ** argv)
 
 
   int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   Coordinate latt4 = GridDefaultLatt();
   int Ls=8;
@@ -57,6 +56,10 @@ int main (int argc, char ** argv)
       std::stringstream ss(argv[i+1]); ss >> Ls;
     }
 
+  GridLogLayout();
+
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
+
 
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -73,9 +76,9 @@ int main (int argc, char ** argv)
   std::vector<int> seeds5({5,6,7,8});
   
   std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
   std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
-  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
   std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 
   LatticeFermion src   (FGrid); random(RNG5,src);
@@ -193,7 +196,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
 
     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -231,7 +234,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
 
     std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
@@ -283,7 +286,7 @@ int main (int argc, char ** argv)
     double t1=usecond();
     FGrid->Barrier();
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
 
     std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
@@ -359,7 +362,7 @@ int main (int argc, char ** argv)
       //      sDw.stat.print();
 
       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=(1344.0*volume*ncall)/2;
+      double flops=(single_site_flops*volume*ncall)/2.0;
 
       std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
       std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
@@ -491,7 +494,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;
 
     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
     std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
index 3dd02962..1e380053 100644
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -50,6 +50,7 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
+
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
@@ -107,6 +108,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
 
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
@@ -200,7 +202,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
 #endif  
   if ( ! report ) {
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
     std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
   }
   
@@ -232,7 +234,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
     
     if(!report){
       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-      double flops=(1344.0*volume*ncall)/2;
+      double flops=(single_site_flops*volume*ncall)/2.0;
       std::cout<< flops/(t1-t0);
     }
   }
@@ -241,6 +243,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
 #define CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
 
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -333,7 +336,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 
   if ( !report){
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
     std::cout<<"\t"<< flops/(t1-t0);
   }
 
@@ -375,7 +378,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 
   if ( ! report ) {
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(1344.0*volume*ncall)/2;
+    double flops=(single_site_flops*volume*ncall)/2.0;
     std::cout<<"\t"<< flops/(t1-t0);
   }
 }
diff --git a/benchmarks/Benchmark_gparity.cc b/benchmarks/Benchmark_gparity.cc
index cb89ec25..b03e1b63 100644
--- a/benchmarks/Benchmark_gparity.cc
+++ b/benchmarks/Benchmark_gparity.cc
@@ -107,7 +107,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=2*1344*volume*ncall;
+    double flops=2*1320*volume*ncall;
 
     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@@ -134,7 +134,7 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=2*1344*volume*ncall;
+    double flops=2*1320*volume*ncall;
 
     std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
@@ -174,7 +174,7 @@ int main (int argc, char ** argv)
     FGrid_d->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=2*1344*volume*ncall;
+    double flops=2*1320*volume*ncall;
 
     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
diff --git a/benchmarks/Benchmark_meson_field.cc b/benchmarks/Benchmark_meson_field.cc
new file mode 100644
index 00000000..8fcb926f
--- /dev/null
+++ b/benchmarks/Benchmark_meson_field.cc
@@ -0,0 +1,812 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_meson_field.cc
+
+    Copyright (C) 2018
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+#include "Grid/util/Profiling.h"
+
+template<class vobj>
+void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat, // out: row i+Lblock*j, column t; must be pre-sized
+				 const std::vector<Lattice<vobj> > &lhs, // left fields, conjugated before the contraction
+				 const std::vector<Lattice<vobj> > &rhs, // right fields
+				 int orthogdim) // dimension whose coordinate becomes the column index t
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  // For every pair (lhs[i],rhs[j]) accumulate conj(lhs)*rhs over each slice of fixed orthogdim coordinate.
+  int Lblock = lhs.size();
+  int Rblock = rhs.size();
+  // All geometry is read from the grid of lhs[0], so lhs must not be empty.
+  GridBase *grid = lhs[0].Grid();
+  
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+  int Nt     = grid->GlobalDimensions()[orthogdim];
+  // Caller contract: one row per (i,j) pair, each row holding Nt entries.
+  assert(mat.size()==Lblock*Rblock);
+  for(int t=0;t<mat.size();t++){
+    assert(mat[t].size()==Nt);
+  }
+
+  int fd=grid->_fdimensions[orthogdim];
+  int ld=grid->_ldimensions[orthogdim];
+  int rd=grid->_rdimensions[orthogdim];
+
+  // will locally sum vectors first
+  // sum across these down to scalars
+  // splitting the SIMD
+  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd*Lblock*Rblock);
+  parallel_for (int r = 0; r < rd * Lblock * Rblock; r++){
+    lvSum[r] = Zero();
+  }
+  // lsSum: one scalar per (i,j,lt) with lt in [0,ld), stored at i+Lblock*j+Lblock*Rblock*lt.
+  std::vector<scalar_type > lsSum(ld*Lblock*Rblock,scalar_type(0.0));             
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+  
+  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
+  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
+  parallel_for(int r=0;r<rd;r++){
+    // Iteration r only writes lvSum[... + Lblock*Rblock*r], so different r never touch the same entry.
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b;
+	for(int i=0;i<Lblock;i++){
+	  auto lhs_v = lhs[i].View();
+	  auto left = conjugate(lhs_v[ss]);
+	  for(int j=0;j<Rblock;j++){
+	    int idx = i+Lblock*j+Lblock*Rblock*r;
+	    auto rhs_v = rhs[j].View();
+	    auto right = rhs_v[ss];
+	    vector_type vv = left()(0)(0) * right()(0)(0) // hand-unrolled sum over components (0..3)(0..2)
+	      +              left()(0)(1) * right()(0)(1)
+	      +              left()(0)(2) * right()(0)(2)
+              +              left()(1)(0) * right()(1)(0)
+	      +              left()(1)(1) * right()(1)(1)
+	      +              left()(1)(2) * right()(1)(2)
+              +              left()(2)(0) * right()(2)(0)
+	      +              left()(2)(1) * right()(2)(1)
+	      +              left()(2)(2) * right()(2)(2)
+              +              left()(3)(0) * right()(3)(0)
+	      +              left()(3)(1) * right()(3)(1)
+	      +              left()(3)(2) * right()(3)(2);
+	    lvSum[idx]=lvSum[idx]+vv;
+	  }
+	}
+      }
+    }
+  }
+
+  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  parallel_for(int rt=0;rt<rd;rt++){
+
+    Coordinate icoor(Nd);
+
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+
+      iScalar<vector_type> temp; 
+      ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);               
+      // Wrap the SIMD accumulator in an iScalar so extract() can split it into Nsimd scalars.
+      temp._internal = lvSum[i+Lblock*j+Lblock*Rblock*rt];
+
+      extract(temp,extracted);
+
+      for(int idx=0;idx<Nsimd;idx++){
+
+	grid->iCoorFromIindex(icoor,idx);
+	// Lane idx has inner coordinate icoor; its slice index on this rank is rt + icoor[orthogdim]*rd.
+	int ldx =rt+icoor[orthogdim]*rd;
+      
+	int ij_dx = i+Lblock*j+Lblock*Rblock*ldx;
+	lsSum[ij_dx]=lsSum[ij_dx]+extracted[idx]._internal;
+
+      }
+    }}
+  }
+
+  std::cout << GridLogMessage << " Entering non parallel loop "<<std::endl;
+  for(int t=0;t<fd;t++)
+  {
+    int pt = t / ld; // processor plane
+    int lt = t % ld;
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+      if (pt == grid->_processor_coor[orthogdim]){
+        int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
+        mat[i+j*Lblock][t] = lsSum[ij_dx];
+      }
+      else{
+        mat[i+j*Lblock][t] = scalar_type(0.0);
+      }
+    }}
+  }
+  std::cout << GridLogMessage << " Done "<<std::endl;
+  // defer sum over nodes: columns t held by another processor plane were zeroed above; no cross-rank reduction is done here.
+  return;
+}
+
+template<class vobj>
+void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat, // out: row mu+i*Ngamma+j*Lblock*Ngamma, column t
+				      const std::vector<Lattice<vobj> > &lhs, // left fields, conjugated before the contraction
+				      const std::vector<Lattice<vobj> > &rhs, // right fields, multiplied by each gamma
+				      int orthogdim, // dimension whose coordinate becomes the column index t
+				      std::vector<Gamma::Algebra> gammas) // gamma insertions; index mu runs over this list
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  // For every (lhs[i],rhs[j],gammas[mu]) accumulate conj(lhs)*(Gamma*rhs) over each slice of fixed orthogdim coordinate.
+  int Lblock = lhs.size();
+  int Rblock = rhs.size();
+  // All geometry is read from the grid of lhs[0], so lhs must not be empty.
+  GridBase *grid = lhs[0].Grid();
+  
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+  int Nt     = grid->GlobalDimensions()[orthogdim];
+  int Ngamma = gammas.size();
+  // Caller contract: one row per (mu,i,j) triple, each row holding Nt entries.
+  assert(mat.size()==Lblock*Rblock*Ngamma);
+  for(int t=0;t<mat.size();t++){
+    assert(mat[t].size()==Nt);
+  }
+
+  int fd=grid->_fdimensions[orthogdim];
+  int ld=grid->_ldimensions[orthogdim];
+  int rd=grid->_rdimensions[orthogdim];
+
+  // will locally sum vectors first
+  // sum across these down to scalars
+  // splitting the SIMD
+  int MFrvol = rd*Lblock*Rblock*Ngamma;
+  int MFlvol = ld*Lblock*Rblock*Ngamma;
+  // lvSum: SIMD accumulators, one per (mu,i,j,r) with r in [0,rd).
+  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(MFrvol);
+  parallel_for (int r = 0; r < MFrvol; r++){
+    lvSum[r] = Zero();
+  }
+  // lsSum: scalar accumulators, one per (mu,i,j,lt) with lt in [0,ld).
+  std::vector<scalar_type > lsSum(MFlvol);             
+  parallel_for (int r = 0; r < MFlvol; r++){
+    lsSum[r]=scalar_type(0.0);
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+  
+  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
+
+  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
+  parallel_for(int r=0;r<rd;r++){
+    // Iteration r only writes lvSum[... + Ngamma*Lblock*Rblock*r], so different r never touch the same entry.
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b;
+	for(int i=0;i<Lblock;i++){
+ 	  auto lhs_v=lhs[i].View();
+	  auto left = conjugate(lhs_v[ss]);
+	  for(int j=0;j<Rblock;j++){
+	  for(int mu=0;mu<Ngamma;mu++){
+	    // The gamma multiplication is repeated for every site and every (i,j,mu).
+	    auto rhs_v = rhs[j].View();
+	    auto right = Gamma(gammas[mu])*rhs_v[ss];
+
+	      vector_type vv = left()(0)(0) * right()(0)(0) // hand-unrolled sum over components (0..3)(0..2)
+		+              left()(0)(1) * right()(0)(1)
+		+              left()(0)(2) * right()(0)(2)
+		+              left()(1)(0) * right()(1)(0)
+		+              left()(1)(1) * right()(1)(1)
+		+              left()(1)(2) * right()(1)(2)
+		+              left()(2)(0) * right()(2)(0)
+		+              left()(2)(1) * right()(2)(1)
+		+              left()(2)(2) * right()(2)(2)
+		+              left()(3)(0) * right()(3)(0)
+		+              left()(3)(1) * right()(3)(1)
+		+              left()(3)(2) * right()(3)(2);
+
+	      int idx = mu+i*Ngamma+Lblock*Ngamma*j+Ngamma*Lblock*Rblock*r;
+
+	      lvSum[idx]=lvSum[idx]+vv;
+	    } // mu
+	  } // j
+	} // i
+      } // b
+    } // n
+  } // r (parallel_for)
+
+  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  parallel_for(int rt=0;rt<rd;rt++){
+
+    iScalar<vector_type> temp; 
+    Coordinate icoor(Nd);
+    ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);               
+
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+    for(int mu=0;mu<Ngamma;mu++){
+
+      int ij_rdx = mu+i*Ngamma+Ngamma*Lblock*j+Ngamma*Lblock*Rblock*rt;
+      temp._internal = lvSum[ij_rdx];
+
+      extract(temp,extracted);
+
+      for(int idx=0;idx<Nsimd;idx++){
+
+	grid->iCoorFromIindex(icoor,idx);
+	// Lane idx has inner coordinate icoor; its slice index on this rank is rt + icoor[orthogdim]*rd.
+	int ldx =rt+icoor[orthogdim]*rd;
+      
+	int ij_ldx = mu+i*Ngamma+Ngamma*Lblock*j+Ngamma*Lblock*Rblock*ldx;
+	lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal;
+
+      }
+    }}}
+  }
+
+  std::cout << GridLogMessage << " Entering non parallel loop "<<std::endl;
+  for(int t=0;t<fd;t++)
+  {
+    int pt = t / ld; // processor plane
+    int lt = t % ld;
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+    for(int mu=0;mu<Ngamma;mu++){
+      if (pt == grid->_processor_coor[orthogdim]){
+        int ij_dx = mu+i*Ngamma+Ngamma*Lblock*j+Ngamma*Lblock*Rblock* lt;
+        mat[mu+i*Ngamma+j*Lblock*Ngamma][t] = lsSum[ij_dx];
+      }
+      else{
+        mat[mu+i*Ngamma+j*Lblock*Ngamma][t] = scalar_type(0.0);
+      }
+    }}}
+  }
+  std::cout << GridLogMessage << " Done "<<std::endl;
+  // defer sum over nodes: columns t held by another processor plane were zeroed above; no cross-rank reduction is done here.
+  return;
+}
+
+
+template<class vobj>
+void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat, // out: row mu+i*Ngamma+j*Lblock*Ngamma, column t
+				      const std::vector<Lattice<vobj> > &lhs, // left fields, conjugated before the contraction
+				      const std::vector<Lattice<vobj> > &rhs, // right fields
+				      int orthogdim, // dimension whose coordinate becomes the column index t
+				      std::vector<Gamma::Algebra> gammas) // gamma insertions, applied only in the final trace
+{
+  // Same output layout as sliceInnerProductMesonFieldGamma, but a spin matrix is accumulated per (i,j) and traced with each gamma at the end.
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  typedef iSpinMatrix<vector_type> SpinMatrix_v;
+  typedef iSpinMatrix<scalar_type> SpinMatrix_s;
+  
+  int Lblock = lhs.size();
+  int Rblock = rhs.size();
+  // All geometry is read from the grid of lhs[0], so lhs must not be empty.
+  GridBase *grid = lhs[0].Grid();
+  
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+  int Nt     = grid->GlobalDimensions()[orthogdim];
+  int Ngamma = gammas.size();
+  // Caller contract: one row per (mu,i,j) triple, each row holding Nt entries.
+  assert(mat.size()==Lblock*Rblock*Ngamma);
+  for(int t=0;t<mat.size();t++){
+    assert(mat[t].size()==Nt);
+  }
+
+  int fd=grid->_fdimensions[orthogdim];
+  int ld=grid->_ldimensions[orthogdim];
+  int rd=grid->_rdimensions[orthogdim];
+
+  // will locally sum vectors first
+  // sum across these down to scalars
+  // splitting the SIMD
+  int MFrvol = rd*Lblock*Rblock;
+  int MFlvol = ld*Lblock*Rblock;
+  // Unlike the per-gamma kernel, the accumulator counts carry no Ngamma factor: each entry is a whole spin matrix.
+  Vector<SpinMatrix_v > lvSum(MFrvol);
+  parallel_for (int r = 0; r < MFrvol; r++){
+    lvSum[r] = Zero();
+  }
+
+  Vector<SpinMatrix_s > lsSum(MFlvol);             
+  parallel_for (int r = 0; r < MFlvol; r++){
+    lsSum[r]=scalar_type(0.0);
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+  
+  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
+
+  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
+  parallel_for(int r=0;r<rd;r++){
+    // Iteration r only writes lvSum[... + Lblock*Rblock*r], so different r never touch the same entry.
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b;
+	for(int i=0;i<Lblock;i++){
+	  
+	  auto lhs_v=lhs[i].View();
+	  auto left = conjugate(lhs_v[ss]);
+	  for(int j=0;j<Rblock;j++){
+	    // vv(s2,s1) = sum over the three inner components of conj(lhs)(s1)*rhs(s2).
+	    SpinMatrix_v vv;
+	    auto rhs_v = rhs[j].View();
+	    auto right = rhs_v[ss];
+	    for(int s1=0;s1<Ns;s1++){
+	    for(int s2=0;s2<Ns;s2++){
+	     vv()(s2,s1)() = left()(s1)(0) * right()(s2)(0)
+		+             left()(s1)(1) * right()(s2)(1)
+		+             left()(s1)(2) * right()(s2)(2);
+	    }}
+
+	    int idx = i+Lblock*j+Lblock*Rblock*r;
+
+	    lvSum[idx]=lvSum[idx]+vv;
+	  
+	  } // j
+	  } // i
+	} // b
+      } // n
+    } // r (parallel_for)
+
+  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  parallel_for(int rt=0;rt<rd;rt++){
+
+    Coordinate icoor(Nd);
+    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);               
+
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+
+      int ij_rdx = i+Lblock*j+Lblock*Rblock*rt;
+
+      extract(lvSum[ij_rdx],extracted);
+
+      for(int idx=0;idx<Nsimd;idx++){
+
+	grid->iCoorFromIindex(icoor,idx);
+	// Lane idx has inner coordinate icoor; its slice index on this rank is rt + icoor[orthogdim]*rd.
+	int ldx    = rt+icoor[orthogdim]*rd;
+
+	int ij_ldx = i+Lblock*j+Lblock*Rblock*ldx;
+
+	lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
+
+      }
+    }}
+  }
+
+  std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
+  parallel_for(int t=0;t<fd;t++)
+  {
+    int pt = t / ld; // processor plane
+    int lt = t % ld;
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+      if (pt == grid->_processor_coor[orthogdim]){
+        int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
+    	for(int mu=0;mu<Ngamma;mu++){
+	  mat[mu+i*Ngamma+j*Lblock*Ngamma][t] = trace(lsSum[ij_dx]*Gamma(gammas[mu])); // gamma applied once per (i,j,t,mu), not per site
+	}
+      }
+      else{
+        for(int mu=0;mu<Ngamma;mu++){
+	  mat[mu+i*Ngamma+j*Lblock*Ngamma][t] = scalar_type(0.0);
+	}
+      }
+    }}
+  }
+  std::cout << GridLogMessage << " Done "<<std::endl;
+  // defer sum over nodes: columns t held by another processor plane were zeroed above; no cross-rank reduction is done here.
+  return;
+}
+
+template<class vobj>
+void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &mat, // out: row mu+m*Ngamma+i*Nmom*Ngamma+j*Nmom*Ngamma*Lblock, column t
+					 const std::vector<Lattice<vobj> > &lhs, // left fields, conjugated before the contraction
+					 const std::vector<Lattice<vobj> > &rhs, // right fields
+					 int orthogdim, // dimension whose coordinate becomes the column index t
+					 std::vector<Gamma::Algebra> gammas, // gamma insertions, applied only in the final trace
+					 const std::vector<LatticeComplex > &mom) // per-site phase fields, one per momentum index m
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  typedef iSpinMatrix<vector_type> SpinMatrix_v;
+  typedef iSpinMatrix<scalar_type> SpinMatrix_s;
+  // For every (lhs[i],rhs[j],mom[m]) accumulate a phase-weighted spin matrix per slice, then trace it with each gamma.
+  int Lblock = lhs.size();
+  int Rblock = rhs.size();
+  // All geometry is read from the grid of lhs[0], so lhs must not be empty.
+  GridBase *grid = lhs[0].Grid();
+  
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+  int Nt     = grid->GlobalDimensions()[orthogdim];
+  int Ngamma = gammas.size();
+  int Nmom   = mom.size();
+  // Caller contract: one row per (mu,m,i,j), each row holding Nt entries.
+  assert(mat.size()==Lblock*Rblock*Ngamma*Nmom);
+  for(int t=0;t<mat.size();t++){
+    assert(mat[t].size()==Nt);
+  }
+
+  int fd=grid->_fdimensions[orthogdim];
+  int ld=grid->_ldimensions[orthogdim];
+  int rd=grid->_rdimensions[orthogdim];
+
+  // will locally sum vectors first
+  // sum across these down to scalars
+  // splitting the SIMD
+  int MFrvol = rd*Lblock*Rblock*Nmom;
+  int MFlvol = ld*Lblock*Rblock*Nmom;
+  // The accumulators carry an Nmom factor but no Ngamma factor: each entry is a whole spin matrix.
+  Vector<SpinMatrix_v > lvSum(MFrvol);
+  parallel_for (int r = 0; r < MFrvol; r++){
+    lvSum[r] = Zero();
+  }
+
+  Vector<SpinMatrix_s > lsSum(MFlvol);             
+  parallel_for (int r = 0; r < MFlvol; r++){
+    lsSum[r]=scalar_type(0.0);
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+  
+  std::cout << GridLogMessage << " Entering first parallel loop "<<std::endl;
+
+  // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
+  parallel_for(int r=0;r<rd;r++){
+    // Iteration r only writes lvSum[... + Nmom*Lblock*Rblock*r], so different r never touch the same entry.
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b;
+	// Spin-index convention: vv(s2,s1) = conj(lhs)(s1)*rhs(s2), the same as sliceInnerProductMesonFieldGamma1.
+	// With it, trace(lsSum*Gamma) below is conj(lhs)(s1) * Gamma(s1,s2) * rhs(s2); storing vv(s1,s2) would use Gamma transposed.
+	for(int i=0;i<Lblock;i++){
+
+	  auto lhs_v = lhs[i].View();
+	  auto left = conjugate(lhs_v[ss]);
+	  for(int j=0;j<Rblock;j++){
+
+	    SpinMatrix_v vv;
+	    auto rhs_v = rhs[j].View();
+	    auto right = rhs_v[ss];
+	    for(int s1=0;s1<Ns;s1++){
+	    for(int s2=0;s2<Ns;s2++){
+	      vv()(s2,s1)() = left()(s1)(0) * right()(s2)(0)
+		+             left()(s1)(1) * right()(s2)(1)
+		+             left()(s1)(2) * right()(s2)(2);
+	    }}
+	    
+	    // After getting the sitewise product do the mom phase loop
+	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
+	    // Trigger unroll
+	    for ( int m=0;m<Nmom;m++){
+	      int idx = m+base;
+	      auto mom_v = mom[m].View();
+	      auto phase = mom_v[ss];
+	      mac(&lvSum[idx],&vv,&phase); // presumably lvSum[idx] += vv*phase - confirm against Grid's mac()
+	    }
+	  
+	  } // j
+	} // i
+      } // b
+    } // n
+  } // r (parallel_for)
+
+  std::cout << GridLogMessage << " Entering second parallel loop "<<std::endl;
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  parallel_for(int rt=0;rt<rd;rt++){
+
+    Coordinate icoor(Nd);
+    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);               
+
+
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+    for(int m=0;m<Nmom;m++){
+
+      int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
+
+      extract(lvSum[ij_rdx],extracted);
+
+      for(int idx=0;idx<Nsimd;idx++){
+
+	grid->iCoorFromIindex(icoor,idx);
+	// Lane idx has inner coordinate icoor; its slice index on this rank is rt + icoor[orthogdim]*rd.
+	int ldx    = rt+icoor[orthogdim]*rd;
+
+	int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
+
+	lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
+
+      }
+    }}}
+  }
+
+  std::cout << GridLogMessage << " Entering third parallel loop "<<std::endl;
+  parallel_for(int t=0;t<fd;t++)
+  {
+    int pt = t / ld; // processor plane
+    int lt = t % ld;
+    for(int i=0;i<Lblock;i++){
+    for(int j=0;j<Rblock;j++){
+      if (pt == grid->_processor_coor[orthogdim]){
+	for(int m=0;m<Nmom;m++){
+	  int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
+	  for(int mu=0;mu<Ngamma;mu++){
+	    mat[ mu
+		+m*Ngamma
+		+i*Nmom*Ngamma
+		+j*Nmom*Ngamma*Lblock][t] = trace(lsSum[ij_dx]*Gamma(gammas[mu]));
+	  }
+	}
+      }
+      else{
+	for(int mu=0;mu<Ngamma;mu++){
+	for(int m=0;m<Nmom;m++){
+	  mat[mu+m*Ngamma+i*Nmom*Ngamma+j*Nmom*Lblock*Ngamma][t] = scalar_type(0.0);
+	}}
+      }
+    }}
+  }
+  std::cout << GridLogMessage << " Done "<<std::endl;
+  // defer sum over nodes: columns t held by another processor plane were zeroed above; no cross-rank reduction is done here.
+  return;
+}
+
+
+
+/*
+template void sliceInnerProductMesonField<SpinColourVector>(std::vector< std::vector<ComplexD> > &mat, 
+						   const std::vector<Lattice<SpinColourVector> > &lhs,
+						   const std::vector<Lattice<SpinColourVector> > &rhs,
+						   int orthogdim) ;
+*/
+
+// Four gamma insertions (X, Y, Z, T) used by the "Four gammas" run in main().
+std::vector<Gamma::Algebra> Gmu4 = {
+  Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
+  Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT
+};
+
+// Sixteen gamma insertions used by the "Sixteen gammas" runs in main().
+// Layout: Gamma5, then each of T, X, Y, Z followed by its Gamma5 product,
+// then the identity, then the six Sigma entries.
+// The position in this list is the mu index of the output rows; keep the order.
+std::vector<Gamma::Algebra> Gmu16 = {
+  Gamma::Algebra::Gamma5,
+  Gamma::Algebra::GammaT,       Gamma::Algebra::GammaTGamma5,
+  Gamma::Algebra::GammaX,       Gamma::Algebra::GammaXGamma5,
+  Gamma::Algebra::GammaY,       Gamma::Algebra::GammaYGamma5,
+  Gamma::Algebra::GammaZ,       Gamma::Algebra::GammaZGamma5,
+  Gamma::Algebra::Identity,
+  Gamma::Algebra::SigmaXT,
+  Gamma::Algebra::SigmaXY,
+  Gamma::Algebra::SigmaXZ,
+  Gamma::Algebra::SigmaYT,
+  Gamma::Algebra::SigmaYZ,
+  Gamma::Algebra::SigmaZT
+};
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  // Times each meson-field kernel on random fields, then cross-checks some of the results.
+  auto latt_size   = GridDefaultLatt();
+  auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  auto mpi_layout  = GridDefaultMpi();
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  
+  const int Nmom=7;
+  int nt = latt_size[Tp];
+  uint64_t vol = 1;
+  for(int d=0;d<Nd;d++){
+    vol = vol*latt_size[d];
+  }
+  
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);
+  pRNG.SeedFixedIntegers(seeds);
+  // Nm is mandatory: without this guard a missing argument reads argv[1] unchecked and Nm<=0 reaches lhs[0] on an empty vector.
+  if ( argc < 2 || atoi(argv[1]) <= 0 ) { std::cout<<GridLogMessage<<"Usage: "<<argv[0]<<" <Nm> [Grid options], Nm > 0"<<std::endl; Grid_finalize(); return 1; }
+  int Nm = atoi(argv[1]); // number of all modes (high + low)
+
+  std::vector<LatticeFermion> v(Nm,&Grid);
+  std::vector<LatticeFermion> w(Nm,&Grid);
+  std::vector<LatticeFermion> gammaV(Nm,&Grid);
+  std::vector<LatticeComplex> phases(Nmom,&Grid);
+
+  for(int i=0;i<Nm;i++) { 
+    random(pRNG,v[i]);
+    random(pRNG,w[i]);
+  }
+
+  for(int i=0;i<Nmom;i++) { 
+    phases[i] = Complex(1.0);
+  }
+
+  double flops = vol * (11.0 * 8.0 + 6.0) * Nm*Nm;
+  double byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm;
+
+  std::vector<ComplexD> ip(nt);
+  std::vector<std::vector<ComplexD> > MesonFields   (Nm*Nm);
+  std::vector<std::vector<ComplexD> > MesonFields4  (Nm*Nm*4);
+  std::vector<std::vector<ComplexD> > MesonFields16 (Nm*Nm*16);
+  std::vector<std::vector<ComplexD> > MesonFields161(Nm*Nm*16);
+  std::vector<std::vector<ComplexD> > MesonFields16mom (Nm*Nm*16*Nmom);
+  std::vector<std::vector<ComplexD> > MesonFieldsRef(Nm*Nm);
+
+  for(int i=0;i<MesonFields.size();i++   )  MesonFields   [i].resize(nt);
+  for(int i=0;i<MesonFieldsRef.size();i++)  MesonFieldsRef[i].resize(nt);
+  for(int i=0;i<MesonFields4.size();i++  )  MesonFields4  [i].resize(nt);
+  for(int i=0;i<MesonFields16.size();i++ )  MesonFields16 [i].resize(nt);
+  for(int i=0;i<MesonFields161.size();i++ ) MesonFields161[i].resize(nt);
+
+  for(int i=0;i<MesonFields16mom.size();i++ ) MesonFields16mom [i].resize(nt);
+
+  GridLogMessage.TimingMode(1);
+  // Reference: one sliceInnerProductVector call per (i,j) pair.
+  std::cout<<GridLogMessage << "Running loop with sliceInnerProductVector"<<std::endl;
+  double t0 = usecond();
+  for(int i=0;i<Nm;i++) { 
+  for(int j=0;j<Nm;j++) { 
+    sliceInnerProductVector(ip, w[i],v[j],Tp);
+    for(int t=0;t<nt;t++){
+      MesonFieldsRef[i+j*Nm][t] = ip[t];
+    }
+  }}
+  double t1 = usecond();
+  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;
+
+  std::cout<<GridLogMessage << "Running loop with new code for Nt="<<nt<<std::endl;
+  t0 = usecond();
+  sliceInnerProductMesonField(MesonFields,w,v,Tp);
+  t1 = usecond();
+  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;
+
+
+  std::cout<<GridLogMessage << "Running loop with Four gammas code for Nt="<<nt<<std::endl;
+  flops = vol * (11.0 * 8.0 + 6.0) * Nm*Nm*4;
+  byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm
+        + vol * ( 2.0 * sizeof(Complex) ) * Nm*Nm* 4;
+  t0 = usecond();
+  sliceInnerProductMesonFieldGamma(MesonFields4,w,v,Tp,Gmu4);
+  t1 = usecond();
+  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;
+
+  std::cout<<GridLogMessage << "Running loop with Sixteen gammas code for Nt="<<nt<<std::endl;
+  flops = vol * (11.0 * 8.0 + 6.0) * Nm*Nm*16;
+  byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm
+        + vol * ( 2.0 * sizeof(Complex) ) * Nm*Nm* 16;
+  t0 = usecond();
+  sliceInnerProductMesonFieldGamma(MesonFields16,w,v,Tp,Gmu16);
+  t1 = usecond();
+  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;
+
+
+  std::cout<<GridLogMessage << "Running loop with Sixteen gammas code1 for Nt="<<nt<<std::endl;
+  flops = vol * ( 2 * 8.0 + 6.0) * Nm*Nm*16;
+  byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm
+        + vol * ( 2.0 * sizeof(Complex) ) * Nm*Nm* 16;
+  t0 = usecond();
+  sliceInnerProductMesonFieldGamma1(MesonFields161, w, v, Tp, Gmu16);
+  t1 = usecond();
+  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;
+
+  std::cout<<GridLogMessage << "Running loop with Sixteen gammas "<<Nmom<<" momenta "<<std::endl;
+  flops = vol * ( 2 * 8.0 + 6.0 + 8.0*Nmom) * Nm*Nm*16;
+  byte  = vol * (12.0 * sizeof(Complex) ) * Nm*Nm
+        + vol * ( 2.0 * sizeof(Complex) *Nmom ) * Nm*Nm* 16;
+  t0 = usecond();
+  sliceInnerProductMesonFieldGammaMom(MesonFields16mom,w,v,Tp,Gmu16,phases);
+  t1 = usecond();
+  std::cout<<GridLogMessage << "Done "<< (t1-t0) <<" usecond " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< flops/(t1-t0) <<" mflops " <<std::endl;
+  std::cout<<GridLogMessage << "Done "<< byte /(t1-t0) <<" MB/s " <<std::endl;
+
+  // NOTE: MesonFields4 and MesonFields16mom are only timed; nothing below compares them with a reference.
+  // Check 1: batched kernel (MesonFields) against the per-pair reference (MesonFieldsRef).
+  RealD err = 0;
+  RealD err2 = 0;
+  ComplexD diff;
+  ComplexD diff2;
+
+  for(int i=0;i<Nm;i++) { 
+  for(int j=0;j<Nm;j++) { 
+    for(int t=0;t<nt;t++){
+      diff = MesonFields[i+Nm*j][t] - MesonFieldsRef[i+Nm*j][t];
+      err += real(diff*conj(diff));
+    }
+  }}
+  std::cout<<GridLogMessage << "Norm error "<< err <<std::endl;
+  // Reset by assignment: multiplying by zero would keep a NaN/Inf from check 1 alive in check 2.
+  err = 0.;
+  diff = 0.;
+  // Check 2: for each of the 16 gammas, MesonFields is overwritten with a fresh per-pair reference.
+  for (int mu = 0; mu < 16; mu++){
+    for (int k = 0; k < gammaV.size(); k++){
+      gammaV[k] = Gamma(Gmu16[mu]) * v[k];
+    }
+    for (int i = 0; i < Nm; i++){
+      for (int j = 0; j < Nm; j++){
+        sliceInnerProductVector(ip, w[i], gammaV[j], Tp);
+        for (int t = 0; t < nt; t++){
+          MesonFields[i + j * Nm][t] = ip[t];
+          diff = MesonFields16[mu+i*16+Nm*16*j][t] - MesonFields161[mu+i*16+Nm*16*j][t];
+          diff2 = MesonFields[i+j*Nm][t] - MesonFields161[mu+i*16+Nm*16*j][t];
+          err += real(diff*conj(diff));
+          err2 += real(diff2*conj(diff2));
+        }
+      }
+    }
+  }
+  std::cout << GridLogMessage << "Norm error 16 gamma1/16 gamma naive    " << err << std::endl;
+  std::cout << GridLogMessage << "Norm error 16 gamma1/sliceInnerProduct " << err2 << std::endl;
+
+  Grid_finalize();
+}
+
diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index f9f64b04..d24a3e25 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -124,6 +124,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
+
   for(int lat=LMIN;lat<=LMAX;lat+=LADD){
 
       Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
@@ -187,5 +188,82 @@ int main (int argc, char ** argv)
 
     }
 
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  CovShiftForward(z,x,y)"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+
+  for(int lat=LMIN;lat<=LMAX;lat+=LADD){
+
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
+
+      for(int mu=0;mu<4;mu++){
+	      double start=usecond();
+	      for(int64_t i=0;i<Nloop;i++){
+	        z = PeriodicBC::CovShiftForward(x,mu,y);
+	    }
+	    double stop=usecond();
+	    double time = (stop-start)/Nloop*1000.0;
+	
+	
+	    double bytes=3*vol*Nc*Nc*sizeof(Complex);
+	    double flops=Nc*Nc*(6+8+8)*vol;
+	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      }
+  }
+#if 1
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x * Cshift(y)"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+
+  for(int lat=LMIN;lat<=LMAX;lat+=LADD){
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
+      LatticeColourMatrix tmp(&Grid);
+
+      for(int mu=0;mu<4;mu++){
+	double tshift=0;
+	double tmult =0;
+
+	double start=usecond();
+	for(int64_t i=0;i<Nloop;i++){
+	  tshift-=usecond();
+	  tmp = Cshift(y,mu,-1);
+	  tshift+=usecond();
+	  tmult-=usecond();
+	  z   = x*tmp;
+	  tmult+=usecond();
+	}
+	double stop=usecond();
+	double time = (stop-start)/Nloop;
+	tshift = tshift/Nloop;
+	tmult  = tmult /Nloop;
+	
+	double bytes=3*vol*Nc*Nc*sizeof(Complex);
+	double flops=Nc*Nc*(6+8+8)*vol;
+	std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl;
+	time = time * 1000; // convert to NS for GB/s
+	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      }
+    }
+#endif
   Grid_finalize();
 }
diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc
index 5ee57ebe..49939620 100644
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -4,7 +4,7 @@
 
     Source file: ./benchmarks/Benchmark_wilson.cc
 
-    Copyright (C) 2015
+    Copyright (C) 2018
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
@@ -32,6 +32,9 @@ using namespace std;
 using namespace Grid;
  ;
 
+
+#include "Grid/util/Profiling.h"
+
 template<class d>
 struct scal {
   d internal;
@@ -44,21 +47,40 @@ struct scal {
     Gamma::Algebra::GammaT
   };
 
+bool overlapComms = false;
+bool perfProfiling = false;
+
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
-  Coordinate latt_size   = GridDefaultLatt();
-  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-  Coordinate mpi_layout  = GridDefaultMpi();
+  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
+    overlapComms = true;
+  }
+  if( GridCmdOptionExists(argv,argv+argc,"--perf") ){
+    perfProfiling = true;
+  }
+
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
+
+
+  auto latt_size   = GridDefaultLatt();
+  auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  auto mpi_layout  = GridDefaultMpi();
+
   GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
   GridRedBlackCartesian     RBGrid(&Grid);
 
   int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  GridLogLayout();
+
   std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
   std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
   std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
+  std::cout<<GridLogMessage << "Grid number of colours : "<< Nc <<std::endl;
+  std::cout<<GridLogMessage << "Benchmarking Wilson operator in the fundamental representation" << std::endl;
+
 
   std::vector<int> seeds({1,2,3,4});
   GridParallelRNG          pRNG(&Grid);
@@ -135,9 +157,25 @@ int main (int argc, char ** argv)
     Dw.Dhop(src,result,0);
   }
   double t1=usecond();
-  double flops=1344*volume*ncall;
+  double flops=single_site_flops*volume*ncall;
+  
+  if (perfProfiling){
+  std::cout<<GridLogMessage << "Profiling Dw with perf"<<std::endl;
+    
+  System::profile("kernel", [&]() {
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+  });
+
+  std::cout<<GridLogMessage << "Generated kernel.data"<<std::endl;
+  std::cout<<GridLogMessage << "Use with: perf report -i kernel.data"<<std::endl;
+
+  }
+
   
   std::cout<<GridLogMessage << "Called Dw"<<std::endl;
+  std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc
index c2378c40..a01a9e1f 100644
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -58,6 +58,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Number of colours "<< Nc <<std::endl;
   std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop                  "<<std::endl;
   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
   if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
@@ -65,13 +66,15 @@ int main (int argc, char ** argv)
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage << "* OpenMP threads       : "<< GridThread::GetThreads() <<std::endl;
+  std::cout << GridLogMessage << "* MPI tasks            : "<< GridCmdVectorIntToString(mpi_layout) << std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
-  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
-  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking Wilson operator in the fundamental representation" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs\tWilsonEO/MFLOPs\tWilsonDagEO/MFLOPs" << std::endl;
+  std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
 
   int Lmax = 32;
   int dmin = 0;
@@ -93,12 +96,19 @@ int main (int argc, char ** argv)
 	  GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
 	  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
 	  LatticeFermion    src(&Grid); random(pRNG,src);
-	  LatticeFermion result(&Grid); result=Zero();
+	  LatticeFermion    src_o(&RBGrid); pickCheckerboard(Odd,src_o,src);
+	  LatticeFermion     result(&Grid); result=Zero();
+	  LatticeFermion result_e(&RBGrid); result_e=Zero();
 
 	  double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());
 
 	  WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
       
+    // Full operator      
+	  bench_wilson(src,result,Dw,volume,DaggerNo);
+	  bench_wilson(src,result,Dw,volume,DaggerYes);
+	  std::cout << "\t";
+    // EO
 	  bench_wilson(src,result,Dw,volume,DaggerNo);
 	  bench_wilson(src,result,Dw,volume,DaggerYes);
 	  std::cout << std::endl;
@@ -117,9 +127,26 @@ void bench_wilson (
 		   int const           dag )
 {
   int ncall    = 1000;
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
   double t0    = usecond();
   for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
   double t1    = usecond();
-  double flops = 1344 * volume * ncall;
+  double flops = single_site_flops * volume * ncall;
+  std::cout << flops/(t1-t0) << "\t\t";
+}
+
+void bench_wilson_eo (
+		   LatticeFermion &    src,
+		   LatticeFermion & result,
+		   WilsonFermionR &     Dw,
+		   double const     volume,
+		   int const           dag )
+{
+  int ncall    = 1000;
+  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
+  double t0    = usecond();
+  for(int i=0; i<ncall; i++) { Dw.DhopEO(src,result,dag); }
+  double t1    = usecond();
+  double flops = (single_site_flops * volume * ncall)/2.0;
   std::cout << flops/(t1-t0) << "\t\t";
 }
diff --git a/bootstrap.sh b/bootstrap.sh
index 573d90eb..cffe1b8c 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,16 +1,14 @@
 #!/usr/bin/env bash
 
-EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.4.tar.bz2'
+EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.5.tar.bz2'
 
 echo "-- deploying Eigen source..."
-cd lib
-rm -rf Eigen
-git clone https://github.com/eigenteam/eigen-git-mirror.git
-mv eigen-git-mirror/Eigen .
-rm -rf eigen-git-mirror
-echo 'eigen_files =\' > Eigen.inc
-find Eigen -type f -print | sed 's/^/  /;$q;s/$/ \\/' >> Eigen.inc
-cd ..
+ARC=`basename ${EIGEN_URL}`
+wget ${EIGEN_URL} --no-check-certificate && ./scripts/update_eigen.sh ${ARC} && rm ${ARC}
+# patch for non-portable includes in Eigen 3.3.5
+# apparently already fixed in Eigen HEAD so it should not be 
+# a problem in the future (A.P.)
+patch Grid/Eigen/unsupported/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
 
 echo '-- generating Make.inc files...'
 ./scripts/filelist
diff --git a/configure.ac b/configure.ac
index adfb7d18..2a702a97 100644
--- a/configure.ac
+++ b/configure.ac
@@ -6,8 +6,8 @@ AC_CANONICAL_TARGET
 AM_INIT_AUTOMAKE([subdir-objects 1.13])
 AM_EXTRA_RECURSIVE_TARGETS([tests bench])
 AC_CONFIG_MACRO_DIR([m4])
-AC_CONFIG_SRCDIR([lib/Grid.h])
-AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h])
+AC_CONFIG_SRCDIR([Grid/Grid.h])
+AC_CONFIG_HEADERS([Grid/Config.h],[sed -i 's|PACKAGE_|GRID_|' Grid/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
 ################ Get git info
@@ -53,7 +53,7 @@ AC_TYPE_UINT64_T
 ############### OpenMP
 AC_OPENMP
 ac_openmp=no
-if test "${OPENMP_CXXFLAGS}X" != "X"; then
+if test "${ac_cv_prog_cxx_openmp}X" != "noX"; then
   ac_openmp=yes
   AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
   AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
@@ -101,6 +101,13 @@ AC_ARG_WITH([lime],
             [AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
             [AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
 
+############### OpenSSL
+AC_ARG_WITH([openssl],
+            [AS_HELP_STRING([--with-openssl=prefix],
+            [try this for a non-standard install prefix of the OpenSSL library])],
+            [AM_CXXFLAGS="-I$with_openssl/include $AM_CXXFLAGS"]
+            [AM_LDFLAGS="-L$with_openssl/lib $AM_LDFLAGS"])
+
 ############### lapack
 AC_ARG_ENABLE([lapack],
     [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
@@ -129,10 +136,13 @@ case ${ac_SFW_FP16} in
       AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
 esac
 
-############### MKL
+############### Intel libraries
 AC_ARG_ENABLE([mkl],
     [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
     [ac_MKL=${enable_mkl}], [ac_MKL=no])
+AC_ARG_ENABLE([ipp],
+    [AC_HELP_STRING([--enable-ipp=yes|no|prefix], [enable Intel IPP for fast CRC32C])],
+    [ac_IPP=${enable_ipp}], [ac_IPP=no])
 
 case ${ac_MKL} in
     no)
@@ -145,6 +155,17 @@ case ${ac_MKL} in
         AC_DEFINE([USE_MKL], [1], [Define to 1 if you use the Intel MKL]);;
 esac
 
+case ${ac_IPP} in
+    no)
+        ;;
+    yes)
+        AC_DEFINE([USE_IPP], [1], [Define to 1 if you use the Intel IPP]);;
+    *)
+        AM_CXXFLAGS="-I$ac_IPP/include $AM_CXXFLAGS"
+        AM_LDFLAGS="-L$ac_IPP/lib $AM_LDFLAGS"
+        AC_DEFINE([USE_IPP], [1], [Define to 1 if you use the Intel IPP]);;
+esac
+
 ############### HDF5
 AC_ARG_WITH([hdf5],
     [AS_HELP_STRING([--with-hdf5=prefix],
@@ -176,7 +197,13 @@ AC_CHECK_FUNCS([gettimeofday])
 
 if test "${ac_MKL}x" != "nox"; then
     AC_SEARCH_LIBS([mkl_set_interface_layer], [mkl_rt], [],
-                   [AC_MSG_ERROR("MKL enabled but library not found")])
+                   [AC_MSG_ERROR("Intel MKL enabled but library not found")])
+fi
+
+if test "${ac_IPP}x" != "nox"; then
+     AC_SEARCH_LIBS([ippsCRC32C_8u], [ippdc], 
+                   [LIBS="${LIBS} -lippdc -lippvm -lipps -lippcore"],
+                   [AC_MSG_ERROR("Intel IPP enabled but library not found")])
 fi
 
 AC_SEARCH_LIBS([__gmpf_init], [gmp],
@@ -201,9 +228,13 @@ AC_SEARCH_LIBS([fftw_execute], [fftw3],
 AC_SEARCH_LIBS([limeCreateReader], [lime],
                [AC_DEFINE([HAVE_LIME], [1], [Define to 1 if you have the `LIME' library])]
                [have_lime=true],
-	       [AC_MSG_WARN(C-LIME library was not found in your system.
-In order to use ILGG file format please install or provide the correct path to your installation
-Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
+	             [AC_MSG_ERROR(LIME library was not found in your system.)])
+
+AC_SEARCH_LIBS([SHA256_Init], [crypto],
+               [AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
+               [have_crypto=true],
+	             [AC_MSG_ERROR(OpenSSL library was not found in your system.)])
+AC_CHECK_HEADER([openssl/sha.h], [], [AC_MSG_ERROR(OpenSSL library found but without headers.)], [AC_INCLUDES_DEFAULT([])])
 
 AC_SEARCH_LIBS([crc32], [z],
                [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
@@ -300,6 +331,9 @@ case ${ax_cv_cxx_compiler_vendor} in
       AVX512)
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
+      SKL)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon])
+        SIMD_FLAGS='-march=skylake-avx512';;
       KNC)
         AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
         SIMD_FLAGS='';;
@@ -387,7 +421,7 @@ case ${ac_PRECISION} in
 esac
 
 ######################  Shared memory allocation technique under MPI3
-AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|hugetlbfs],
+AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone],
               [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
 
 case ${ac_SHM} in
@@ -396,6 +430,14 @@ case ${ac_SHM} in
      AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
      ;;
 
+     shmget)
+     AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
+     ;;
+
+     shmnone)
+     AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
+     ;;
+
      hugetlbfs)
      AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
      ;;
@@ -409,7 +451,7 @@ esac
 AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
               [Select SHM mmap base path for hugetlbfs])],
 	      [ac_SHMPATH=${enable_shmpath}],
-	      [ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
+	      [ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/])
 AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
 
 ############### communication type selection
@@ -513,19 +555,21 @@ DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])
 
 ############### Ouput
 cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
+GRID_CXX="$CXX"
 GRID_CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
 GRID_LDFLAGS="$AM_LDFLAGS $LDFLAGS"
 GRID_LIBS=$LIBS
 GRID_SHORT_SHA=`git rev-parse --short HEAD`
 GRID_SHA=`git rev-parse HEAD`
 GRID_BRANCH=`git rev-parse --abbrev-ref HEAD`
-AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
-AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
-AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
+AM_CXXFLAGS="-I${abs_srcdir} $AM_CXXFLAGS"
+AM_CFLAGS="-I${abs_srcdir} $AM_CFLAGS"
+AM_LDFLAGS="-L${cwd}/Grid $AM_LDFLAGS"
 AC_SUBST([CXXLD])
 AC_SUBST([AM_CFLAGS])
 AC_SUBST([AM_CXXFLAGS])
 AC_SUBST([AM_LDFLAGS])
+AC_SUBST([GRID_CXX])
 AC_SUBST([GRID_CXXFLAGS])
 AC_SUBST([GRID_LDFLAGS])
 AC_SUBST([GRID_LIBS])
@@ -576,7 +620,7 @@ AC_SUBST([GRID_SUMMARY])
 
 AC_CONFIG_FILES([grid-config], [chmod +x grid-config])
 AC_CONFIG_FILES(Makefile)
-AC_CONFIG_FILES(lib/Makefile)
+AC_CONFIG_FILES(Grid/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
 AC_CONFIG_FILES(tests/IO/Makefile)
 AC_CONFIG_FILES(tests/core/Makefile)
@@ -590,8 +634,8 @@ AC_CONFIG_FILES(tests/smearing/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(tests/testu01/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
-AC_CONFIG_FILES(extras/Makefile)
-AC_CONFIG_FILES(extras/Hadrons/Makefile)
+AC_CONFIG_FILES(Hadrons/Makefile)
+AC_CONFIG_FILES(Hadrons/Utilities/Makefile)
 AC_OUTPUT
 
 echo ""
diff --git a/documentation/Grid.pdf b/documentation/Grid.pdf
new file mode 100644
index 00000000..8b9f2be1
Binary files /dev/null and b/documentation/Grid.pdf differ
diff --git a/documentation/Makefile b/documentation/Makefile
new file mode 100644
index 00000000..1c9c886f
--- /dev/null
+++ b/documentation/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation: every goal is forwarded to
+# sphinx-build's "make mode" (-M), e.g. "make html".
+
+# You can set these variables from the command line (SPHINXPROJ is not used by the rules below).
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = Grid
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/documentation/conf.py b/documentation/conf.py
new file mode 100644
index 00000000..dc055c4f
--- /dev/null
+++ b/documentation/conf.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/stable/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'Grid'
+copyright = '2018, Peter Boyle, Guido Cossu, Antonin Portelli, Azusa Yamaguchi'
+author = 'Peter Boyle, Guido Cossu, Antonin Portelli, Azusa Yamaguchi'
+
+# The short X.Y version
+version = ''
+# The full version, including alpha/beta/rc tags
+release = ''
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.todo',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.ifconfig',
+    'sphinx.ext.githubpages',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'manual'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path .
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+primary_domain = 'cpp'
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+html_use_smartypants = False
+smart_quotes = False
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself.  Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Griddoc'
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    #'papersize': 'a4paper',
+
+    'extraclassoptions': 'openany,oneside',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    'pointsize': '8pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    
+    # Latex figure (float) alignment
+    #
+    'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [  # LaTeX backslashes are written as '\\' (a bare '\i' is an invalid Python escape)
+    (master_doc, 'Grid.tex', ' Grid Documentation ',
+     '\\includegraphics[width=.4\\textwidth]{logo.png} \\\\Peter Boyle, Guido Cossu, Antonin Portelli, Azusa Yamaguchi', 'manual'),
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'grid', 'Grid Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'Grid', 'Grid Documentation',
+     author, 'Grid', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+# -- Extension configuration -------------------------------------------------
+
+# -- Options for todo extension ----------------------------------------------
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
diff --git a/documentation/interfacing.rst b/documentation/interfacing.rst
new file mode 100644
index 00000000..79e44881
--- /dev/null
+++ b/documentation/interfacing.rst
@@ -0,0 +1,232 @@
+Interfacing with external software
+========================================
+
+Grid provides a number of important modules, such as solvers and
+eigensolvers, that are highly optimized for complex vector/SIMD
+architectures, such as the Intel Xeon Phi KNL and Skylake processors.
+This growing library, with appropriate interfacing, can be accessed
+from existing code. Here we describe interfacing issues and provide
+examples.
+
+	  
+MPI initialization
+--------------------
+
+Grid supports threaded MPI sends and receives and, if running with
+more than one thread, requires the MPI_THREAD_MULTIPLE mode of message
+passing. If the user initializes MPI before starting Grid, the
+appropriate initialization call is::
+
+  MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
+  assert(MPI_THREAD_MULTIPLE == provided);
+
+Grid Initialization
+---------------------
+
+Grid itself is initialized with a call::
+
+  Grid_init(&argc, &argv);
+
+Command line options include::
+
+  --mpi n.n.n.n   : default MPI decomposition
+  --threads n     : default number of OMP threads
+  --grid n.n.n.n  : default Grid size
+  
+where `argc` and `argv` are constructed to simulate the command-line
+options described above.  At a minimum one usually provides the
+`--grid` and `--mpi` parameters.  The former specifies the lattice
+dimensions and the latter specifies the grid of processors (MPI
+ranks).  If these parameters are not specified with the `Grid_init`
+call, they need to be supplied later when creating Grid fields.
+
+The following Grid procedures are useful for verifying that Grid
+"default" values are properly initialized.
+
+=============================================================   ===========================================================================================================
+  Grid procedure                                                  returns 
+=============================================================   ===========================================================================================================
+  std::vector<int> GridDefaultLatt();                            lattice size
+  std::vector<int> GridDefaultSimd(int Nd,vComplex::Nsimd());    SIMD layout
+  std::vector<int> GridDefaultMpi();                             MPI layout
+  int Grid::GridThread::GetThreads();                            number of threads
+=============================================================   ===========================================================================================================
+
+
+MPI coordination
+----------------
+
+Grid wants to use its own numbering of MPI ranks and its own
+assignment of the lattice coordinates with each rank.  Obviously, the
+calling program and Grid must agree on these conventions.  One should
+use Grid's Cartesian communicator class to discover the processor
+assignments. For a four-dimensional processor grid one can define::
+
+  static Grid::CartesianCommunicator *grid_cart = NULL;
+  grid_cart = new Grid::CartesianCommunicator(processors);
+
+where `processors` is of type `std::vector<int>`, with values matching
+the MPI processor-layout dimensions specified with the `--mpi`
+argument in the `Grid_init` call.  Then each MPI rank can obtain its
+processor coordinate using the Cartesian communicator instantiated
+above.  For example, in four dimensions::
+
+  std::vector<int> pePos(4);    
+  for(int i=0; i<4; i++)
+     pePos[i] = grid_cart->_processor_coor[i];
+
+and each MPI process can get its world rank from its processor
+coordinates using::
+
+  int peRank = grid_cart->RankFromProcessorCoor(pePos);
+	  
+Conversely, each MPI process can get its processor coordinates from
+its world rank using::
+
+  grid_cart->ProcessorCoorFromRank(peRank, pePos);
+
+If the calling program initialized MPI before initializing Grid, it is
+then important for each MPI process in the calling program to reset
+its rank number so it agrees with Grid::
+
+   MPI_Comm comm;
+   MPI_Comm_split(MPI_COMM_THISJOB,jobid,peRank,&comm);
+   MPI_COMM_THISJOB = comm;
+
+where `MPI_COMM_THISJOB` is initially a copy of `MPI_COMM_WORLD` (with
+`jobid = 0`), or it is a split communicator with `jobid` equal to the
+index number of the subcommunicator.  Once this is done, the call::
+
+  MPI_Comm_rank(MPI_COMM_THISJOB, &myrank);
+
+returns a rank that agrees with Grid's `peRank`.
+
+QMP coordination
+----------------
+
+If the calling program uses the SciDAC QMP message-passing package, a
+call to QMP_comm_split() instead can be used to reassign the ranks.
+In the example below, `peGrid` gives the processor-grid dimensions,
+usually set on the command line with `-qmp-geom`.
+
+**Example**::
+  
+  int NDIM = QMP_get_allocated_number_of_dimensions();
+  Grid::Grid_init(&argc,&argv);
+  FgridBase::grid_initted=true;
+  std::vector<int> processors;
+  for(int i=0;i<NDIM;i++) processors.push_back(peGrid[i]);
+  Grid::CartesianCommunicator grid_cart(processors);
+  std::vector<int> pePos(NDIM);
+  for(int i=NDIM-1;i>=0;i--)
+     pePos[i] = grid_cart._processor_coor[i];
+  int peRank = grid_cart.RankFromProcessorCoor(pePos);
+  QMP_comm_split(QMP_comm_get_default(),0,peRank,&qmp_comm);
+  QMP_comm_set_default(qmp_comm);
+
+  
+Mapping fields between Grid and user layouts
+---------------------------------------------
+
+In order to map data between calling-program and Grid layouts, it is
+important to know how the lattice sites are distributed across the
+processor grid.  A lattice site with coordinates `r[mu]` is assigned
+to the processor with processor coordinates `pePos[mu]` according to
+the rule::
+
+  pePos[mu] = r[mu]/dim[mu]
+
+where `dim[mu]` is the local (per-rank) lattice dimension in the `mu` direction,
+i.e. the full lattice dimension divided by the number of ranks in that direction.
+For performance reasons, it is important that the external data layout
+follow the same rule.  Then data mapping can be done without requiring
+costly communication between ranks.  We assume this is the case here.
+
+When mapping data to and from Grid, one must choose a lattice object
+defined on the appropriate grid, whether it be a full lattice (4D
+`GridCartesian`), one of the checkerboards (4D
+`GridRedBlackCartesian`), a five-dimensional full grid (5D
+`GridCartesian`), or a five-dimensional checkerboard (5D
+`GridRedBlackCartesian`).  For example, an improved staggered-fermion
+color-vector field `cv` on a single checkerboard would be constructed
+using
+
+**Example**::
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian       RBGrid(&Grid);
+
+  typename ImprovedStaggeredFermion::FermionField  cv(&RBGrid);
+
+The example above assumes that the grid default values were set in the
+`Grid_init` call.  If not, they can be set at this point and passed
+when `GridCartesian` is instantiated here.  To map data within an MPI
+rank, the external code must iterate over the sites belonging to that
+rank (full or checkerboard as appropriate).  Note that the site
+coordinates are specified relative to the origin of the lattice
+subvolume on that rank. To import data into Grid, the external data on
+a single site with coordinates `r` is first copied into the
+appropriate Grid scalar object `s`.  Then it is copied into the Grid
+lattice field `l` with `pokeLocalSite`::
+
+  pokeLocalSite(const sobj &s, Lattice<vobj> &l, Coordinate &r);
+
+To export data from Grid, the reverse operation starts with::
+
+  peekLocalSite(sobj &s, const Lattice<vobj> &l, Coordinate &r);
+
+and then copies the single-site data from `s` into the corresponding
+external type.
+
+Here is an example that maps a single site's worth of data in a MILC
+color-vector field to a Grid scalar ColourVector object `cVec` and from
+there to the lattice colour-vector field `cv`, as defined above.
+
+**Example**::
+
+  indexToCoords(idx,r);
+  ColourVector cVec;
+  for(int col=0; col<Nc; col++)
+      cVec()()(col) = 
+          Complex(src[idx].c[col].real, src[idx].c[col].imag);
+
+  pokeLocalSite(cVec, cv, r);
+
+Here the `indexToCoords()` function is a MILC mapping of the MILC site
+index `idx` to the 4D lattice coordinate `r`.
+
+Grid provides block- and multiple-rhs conjugate-gradient solvers. For
+this purpose it uses a 5D lattice. To map data to and from Grid data
+types, the index for the right-hand-side vector becomes the zeroth
+coordinate of a five-dimensional vector `r5`.  The remaining
+components of `r5` contain the 4D space-time coordinates.  The
+`pokeLocalSite/peekLocalSite` operations then accept the coordinate
+`r5`, provided the destination/source lattice object is also 5D.  In
+the example below data from a single site specified by `idx`,
+belonging to a set of `Ls` MILC color-vector fields, are copied into a
+Grid 5D fermion field `cv5`.
+
+**Example**::
+
+    GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+    typename ImprovedStaggeredFermion5D::FermionField  cv5(FrbGrid);
+    std::vector<int> r(4);
+    indexToCoords(idx,r);
+    std::vector<int> r5(1,0);
+    for( int d = 0; d < 4; d++ ) r5.push_back(r[d]);
+
+    for( int j = 0; j < Ls; j++ ){
+      r5[0] = j;
+      ColourVector cVec;
+      for(int col=0; col<Nc; col++){
+	  cVec()()(col) = 
+	      Complex(src[j][idx].c[col].real, src[j][idx].c[col].imag);
+      }
+      pokeLocalSite(cVec, cv5, r5);
+    }
+
diff --git a/documentation/logo.png b/documentation/logo.png
new file mode 100755
index 00000000..d104339f
Binary files /dev/null and b/documentation/logo.png differ
diff --git a/documentation/manual.rst b/documentation/manual.rst
new file mode 100644
index 00000000..1596de5e
--- /dev/null
+++ b/documentation/manual.rst
@@ -0,0 +1,3350 @@
+.. Grid documentation 
+.. highlight:: cpp
+
+Welcome to Grid's documentation!
+==================================================
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+Preliminaries
+====================================
+
+.. attention:: manual version 1 (CD)
+   
+Grid is primarily an *application* *development* *interface* (API) for structured Cartesian grid codes and is written in C++11.
+In particular it is aimed at Lattice Field Theory simulations in general gauge theories, but
+with a particular emphasis on supporting SU(3) and U(1) gauge theories relevant to hadronic physics.
+
+Who will use this library
+---------------------------
+
+As an application development interface *Grid* is primarily a programmer's tool providing the
+building blocks and primitives for constructing lattice gauge theory programmes. 
+
+Grid functionality includes:
+
+* Data parallel primitives, similar to QDP++
+
+* gauge and fermion actions 
+
+* solvers
+
+* gauge and fermion force terms
+
+* integrators and (R)HMC.
+
+* parallel field I/O 
+
+* object serialisation (text, XML, JSON...)
+
+Grid is intended to enable the rapid and easy development of code with reasonably competitive performance.
+
+It is first and foremost a *library* to which people can programme, and develop new algorithms and measurements.
+As such, it is very much hoped that people's principal point of contact with Grid will be in
+the wonderfully rich C++ language. Since import and export procedures are provided for the opaque lattice types
+it should be possible to call Grid from other code bases. 
+
+Grid is most tightly coupled to the Hadrons package 
+developed principally by Antonin Portelli. 
+This package is entirely composed against the Grid data parallel interface.
+
+Interfacing to other packages is also possible.
+
+Several regression tests that combine Grid with Chroma are included in the Grid distribution.
+Further, Grid has been successfully interfaced to 
+
+* The Columbia Physics System
+
+* The MILC code
+
+
+Data parallel interface
+----------------------------------------------------------------------------------
+
+Most users will wish to interact with Grid above the data parallel *Lattice* interface. At this level
+a programme is simply written as a series of statements, addressing entire lattice objects. 
+
+
+Implementation details may be provided to explain how the code works, but are not strictly part of the API.
+
+**Example**
+
+   For example, as an implementation detail, in a single programme multiple data (SPMD) message passing supercomputer the main programme is trivially replicated on each computing node. The data parallel operations are called *collectively* by all nodes. Any scalar values returned by the various reduction routines are the same on each node, resulting in (for example) the same decision being made by all nodes to terminate an iterative solver on the same iteration. 
+
+
+
+Internal development
+------------------------------------------
+Internal developers may contribute to Grid at a level below the data parallel interface.
+
+Specifically, development of new lattice Dirac operators, for example, 
+or any codes directly interacting with the 
+
+* Communicators
+
+* Simd 
+
+* Tensor
+
+* Stencil 
+
+will make use of facilities provided by Grid to assist the creation of high performance code. 
+The internal data layout complexities
+will be exposed to some degree and the interfaces are subject to change without notice as HPC architectures change.
+
+Since some of the internal implementation details are needed to explain the design strategy of Grid, these will be 
+documented, but labelled as *implementation dependent*.
+
+Reasonable endeavours will be made to preserve functionality where practical but no guarantees are made.
+
+Reporting Bugs
+===================================
+
+To help us track and resolve issues with Grid more efficiently, please report problems using the issue system of GitHub rather than sending emails to Grid developers.
+
+When you file an issue, please go through the following checklist:
+
+* Check that the code is pointing to the HEAD of develop or any commit in master which is tagged with a version number.
+
+* Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example ``cat /proc/cpuinfo | grep 'model name' | uniq`` (Linux) or ``sysctl machdep.cpu.brand_string`` (macOS), and the full output of the ``--version`` option of your compiler.
+
+* Give the exact configure command used.
+
+* Attach config.log.
+
+* Attach grid.config.summary.
+
+* Attach the output of make V=1.
+
+* Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
+
+Download, installation and build
+=========================================
+
+Required libraries
+----------------------------------------
+
+* GMP,
+
+* MPFR
+
+* Eigen: the bootstrap script downloads the Eigen library, which Grid uses for internal dense matrix (non-QCD) operations.
+
+Grid optionally uses:
+
+* HDF5
+
+* LIME for ILDG and SciDAC file format support.
+
+* FFTW either generic version or via the Intel MKL library.
+
+* LAPACK either generic version or Intel MKL library.
+
+
+Compilers
+---------
+
+* Intel ICPC v17 and later
+
+* Clang v3.5 and later (need 3.8 and later for OpenMP)
+
+* GCC v4.9.x 
+
+* GCC v6.3 and later (recommended)
+
+**Important:**
+
+Some versions of GCC appear to have a bug under high optimisation (-O2, -O3).
+
+The safety of these compiler versions cannot be guaranteed at this time. Follow Issue 100 for details and updates.
+
+GCC v5.x
+
+GCC v6.1, v6.2
+
+Quick start
+------------
+First, start by cloning the repository::
+
+  git clone https://github.com/paboyle/Grid.git
+
+Then enter the cloned directory and set up the build system::
+
+  cd Grid
+  ./bootstrap.sh
+
+Now you can execute the `configure` script to generate makefiles (here from a build directory)::
+
+  mkdir build; cd build
+  ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto \
+      --prefix=<path>
+
+where::
+
+  --enable-precision=single|double
+
+sets the **default precision**. Since this is largely a benchmarking convenience, it is anticipated that the default precision may be removed in future implementations,
+and that explicit type selection be made at all points. Naturally, most code will be type templated in any case. ::
+
+   --enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX
+
+sets the **SIMD architecture**, ::
+
+   --enable-comms=mpi|none
+
+selects whether to use MPI communication (mpi) or no communication (none). ::
+
+   --prefix=<path>
+
+should be passed the prefix path where you want to install Grid. 
+
+Other options are detailed in the next section, you can also use ::
+
+   configure --help
+
+to display them. 
+
+As with any other program using GNU autotools, the ::
+
+   CXX, CXXFLAGS, LDFLAGS, ... 
+
+environment variables can be modified to customise the build.
+
+Finally, you can build, check, and install Grid::
+
+   make;
+   make check;
+   make install
+
+
+If you want to build all the tests just use `make tests`.
+
+Detailed build configuration options
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================================  ==============================================================================================================================
+ Option                                     usage
+========================================  ==============================================================================================================================
+ ``--prefix=path``                          installation prefix for Grid.
+ ``--with-gmp=path``                        look for GMP in the UNIX prefix `<path>`
+ ``--with-mpfr=path``                       look for MPFR in the UNIX prefix `<path>`
+ ``--with-fftw=path``                       look for FFTW in the UNIX prefix `<path>`
+ ``--with-lime=path``                       look for c-lime in the UNIX prefix `<path>`
+ ``--enable-lapack[=path]``                 enable LAPACK support in Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional).
+ ``--enable-mkl[=path]``                    use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
+ ``--enable-simd=code``                     set up Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
+ ``--enable-gen-simd-width=size``           select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes.
+ ``--enable-precision=single|double``       set the default precision (default: `double`).
+ ``--enable-comms=mpi|none``                use `<comm>` for message passing (default: `none`).
+ ``--enable-rng=sitmo|ranlux48|mt19937``    choose the RNG (default: `sitmo`).
+ ``--disable-timers``                       disable system dependent high-resolution timers.
+ ``--enable-chroma``                        enable Chroma regression tests.
+ ``--enable-doxygen-doc``                   enable the Doxygen documentation generation (build with `make doxygen-doc`)
+========================================  ==============================================================================================================================
+
+
+Possible communication interfaces
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following options can be used with the ``--enable-comms=`` option to target different communication interfaces:
+
+===============    ==========================================================================================
+ <comm>            Description            
+===============    ==========================================================================================
+ `none`            no communications    
+ `mpi`             MPI communications with compiler CXX
+ `mpi-auto`        MPI communications with compiler CXX but clone flags from MPICXX
+===============    ==========================================================================================
+
+For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard wrappers ( `CXX=CC` ) set up by Cray `PrgEnv` modules instead.  
+
+
+Possible SIMD types
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following options can be used with the ``--enable-simd=`` option to target different SIMD instruction sets:
+
+============    =====================================================================================================================
+  `<simd>`      Description                             
+============    =====================================================================================================================
+  `GEN`         generic portable vector code            
+  `SSE4`        SSE 4.2 (128 bit)                       
+  `AVX`         AVX (256 bit)                           
+  `AVXFMA`      AVX (256 bit) + FMA                     
+  `AVXFMA4`     AVX (256 bit) + FMA4                    
+  `AVX2`        AVX 2 (256 bit)                         
+  `AVX512`      AVX 512 bit                             
+  `NEONv8`      `ARM NEON <http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html>`_ (128 bit)
+  `QPX`         IBM QPX (256 bit)                       
+============    =====================================================================================================================
+
+Alternatively, some CPU codenames can be directly used:
+
+============    =====================================================================================================================
+  `<simd>`      Description                             
+============    =====================================================================================================================
+  `KNL`         `Intel Xeon Phi codename Knights Landing <http://ark.intel.com/products/codename/48999/Knights-Landing>`_
+  `SKL`         `Intel Skylake with AVX512 extensions <https://ark.intel.com/products/codename/37572/Skylake#@server>`_
+  `BGQ`         Blue Gene/Q                             
+============    =====================================================================================================================
+
+
+Notes
+^^^^^^^
+* We currently support AVX512 for the Intel compiler and GCC (KNL and SKL target). Support for clang will appear in future 
+  versions of Grid when the AVX512 support in the compiler is more advanced.
+* For BG/Q only `bgclang <http://trac.alcf.anl.gov/projects/llvm-bgq>`_ is supported. We do not presently plan to support more compilers for this platform.
+* BG/Q performance is currently rather poor. This is being investigated for future versions.
+* The vector size for the `GEN` target can be specified with the `configure` script option ``--enable-gen-simd-width``.
+
+Build setup for Intel Knights Landing platform
+---------------------------------------------------------------------------------------
+
+The following configuration is recommended for the Intel Knights Landing platform::
+
+  ../configure --enable-precision=double\
+             --enable-simd=KNL        \
+             --enable-comms=mpi-auto  \
+             --enable-mkl             \
+             CXX=icpc MPICXX=mpiicpc
+
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernel Library.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::
+
+  ../configure --enable-precision=double\
+             --enable-simd=KNL        \
+             --enable-comms=mpi       \
+             --enable-mkl             \
+             CXX=CC CC=cc
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed::
+
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       
+
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+Knights Landing with Intel Omnipath adapters with two adapters per node 
+presently performs better with use of more than one rank per node, using shared memory 
+for interior communication.
+We recommend four ranks per node for best performance, but optimum is local volume dependent. ::
+
+   ../configure --enable-precision=double\
+             --enable-simd=KNL        \
+             --enable-comms=mpi-auto \
+             --enable-mkl             \
+             CXX=icpc MPICXX=mpiicpc 
+
+Build setup for Intel Haswell Xeon platform
+---------------------------------------------------------------------------------------
+
+The following configuration is recommended for the Intel Haswell platform::
+
+  ../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi-auto \
+             --enable-mkl             \
+             CXX=icpc MPICXX=mpiicpc
+
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernel Library.
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed::
+
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       
+
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::
+
+  ../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+
+Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using::
+
+        export I_MPI_PIN=1
+
+This is the default.
+
+Build setup for Intel Skylake Xeon platform
+----------------------------------------------------------------------------
+
+
+The following configuration is recommended for the Intel Skylake platform::
+
+  ../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi      \
+             --enable-mkl             \
+             CXX=mpiicpc
+
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernel Library.
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed::
+
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       
+
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use::
+
+  ../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+
+Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using::
+
+        export I_MPI_PIN=1
+
+This is the default. 
+
+Build setup for AMD EPYC / RYZEN
+----------------------------------------------------------------------------
+
+The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
+So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
+are common. Each chip within the module exposes a separate NUMA domain.
+There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
+MPI-3 is recommended with the use of four ranks per socket,
+and 8 threads per rank. 
+
+The following configuration is recommended for the AMD EPYC platform::
+
+
+  ../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi \
+             CXX=mpicxx 
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed::
+
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       
+
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
+
+Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
+This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this. 
+
+It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI and
+shared memory to communicate within the node:
+
+.. describe::  command line
+
+  mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4 
+
+	  
+Where omp_bind.sh does the following::
+
+  #!/bin/bash
+
+  numanode=` expr $PMI_RANK % 8 `
+  basecore=`expr $numanode \* 16`
+  core0=`expr $basecore + 0 `
+  core1=`expr $basecore + 2 `
+  core2=`expr $basecore + 4 `
+  core3=`expr $basecore + 6 `
+  core4=`expr $basecore + 8 `
+  core5=`expr $basecore + 10 `
+  core6=`expr $basecore + 12 `
+  core7=`expr $basecore + 14 `
+
+  export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+  echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+  $@
+
+
+Build setup for laptops, other compilers, non-cluster builds
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Many versions of g++ and clang++ work with Grid; using them merely involves replacing CXX (and MPICXX)
+and omitting the ``--enable-mkl`` flag. 
+
+Single node builds are enabled with::
+
+            --enable-comms=none
+
+
+FFTW support that is not in the default search path may then be enabled with::
+
+    --with-fftw=<installpath>
+
+BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
+
+
+Execution model 
+============================================
+
+Grid is intended to support performance portability across many platforms, ranging from single processors
+to message passing CPU clusters and accelerated computing nodes.
+
+The library provides data parallel C++ container classes with internal memory layout that is transformed to map efficiently to SIMD architectures. CSHIFT facilities are provided, similar to HPF and cmfortran, and user control is given over the mapping of array indices to both MPI tasks and SIMD processing elements.
+
+Identically shaped arrays can then be processed with perfect data parallelisation.
+Such identically shaped arrays are called conformable arrays.
+The transformation is based on the observation that Cartesian array processing involves identical processing to be performed on different regions of the Cartesian array.
+
+The library will geometrically decompose arrays both across MPI tasks and across SIMD lanes. Local vector loops are parallelised with OpenMP pragmas.
+
+Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification for most programmers.
+
+The two broad optimisation targets are:
+
+* MPI, OpenMP, and SIMD parallelism 
+
+Presently SSE4, ARM NEON (128 bits), AVX, AVX2, QPX (256 bits), and AVX512 (512 bits) targets are supported
+with aggressive use of architecture vectorisation intrinsic functions.
+
+* MPI between nodes with data parallel offload to GPU's.
+
+For the latter generic C++ code is used both on the host and on the GPU, with a common vectorisation
+granularity.
+
+Accelerator memory model
+----------------------------------------------
+For accelerator targets it is assumed that heap allocations can be shared between the CPU
+and the accelerator. This corresponds to lattice fields having their memory allocated with 
+*cudaMallocManaged* with Nvidia GPU's. 
+
+Grid does not assume that stack or data segments share a common address space with an accelerator.
+
+* This constraint presently rules out porting Grid to AMD GPU's which do not support managed memory.
+
+* At some point in the future a caching strategy may be implemented to enable running on AMD GPU's.
+
+Data parallel API
+=====================================
+
+Data parallel array indices are divided into two types. 
+
+* Internal indices, such as complex, colour, spin degrees of freedom 
+
+* spatial (space-time) indices.
+
+The ranges of all internal degrees are determined by template parameters, 
+and known at compile time. The ranges of spatial indices are dynamic, run time
+values and the Cartesian structure information is contained and accessed via `Grid` objects.
+
+Grid objects are the controlling entity for the decomposition of a distributed `Lattice`
+array across MPI tasks, nodes, SIMD lanes, accelerators. Threaded loops are used
+as appropriate on host code.
+
+(binary) Data parallel operations can only be performed between Lattice objects constructed
+from the same Grid pointer. These are called `conformable` operations.
+
+We will focus initially on the internal indices as these are the building blocks assembled
+in Lattice container classes. Every Lattice container class constructor requires a Grid object 
+pointer. 
+
+Tensor classes
+----------------------------
+
+The Tensor data structures are built up from fundamental 
+scalar matrix and vector classes::
+
+    template<class vobj      > class iScalar { private: vobj _internal ; } 
+    template<class vobj,int N> class iVector { private: vobj _internal[N] ; } 
+    template<class vobj,int N> class iMatrix { private: vobj _internal[N][N] ; }
+
+
+These are template classes and can be passed a fundamental scalar or vector type, or
+nested to form arbitrarily complicated tensor products of indices. All mathematical expressions
+are defined to operate recursively, index by index.
+
+Presently the constants
+
+* Nc
+* Nd 
+
+are globally predefined. However, this is planned to change in future and policy classes
+for different theories (e.g. QCD, QED, SU2 etc...) will contain these constants and enable multiple
+theories to coexist more naturally.
+
+Arbitrary tensor products of fundamental scalar, vector
+and matrix objects may be formed in principle by the basic Grid code. 
+
+For Lattice field theory, we define types according to the following tensor
+product structure ordering. The suffix "D" indicates double precision types, and
+replacing with "F" gives the corresponding single precision type.
+
+The test cases have R, which takes the compiled default precision (either F or D).
+This is for convenience only and may be deprecated in future forcing code external
+to Grid to choose the specific word size.
+
+Type definitions are provided in qcd/QCD.h to give the internal index structures
+of QCD codes. For example::
+
+    template<typename vtype> 
+    using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
+    using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
+    using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
+    using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
+    using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
+    using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
+    using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
+    using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
+    using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
+    using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
+    using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+
+Giving the type table:
+
+=======   =======    ======  ======  ===========  =======================
+Lattice   Lorentz    Spin    Colour  scalar_type   Field
+=======   =======    ======  ======  ===========  =======================
+Scalar    Scalar     Scalar  Scalar  RealD         RealD
+Scalar    Scalar     Scalar  Scalar  ComplexD      ComplexD
+Scalar    Scalar     Scalar  Matrix  ComplexD      ColourMatrixD
+Scalar    Vector     Scalar  Matrix  ComplexD      LorentzColourMatrixD
+Scalar    Scalar     Vector  Vector  ComplexD      SpinColourVectorD
+Scalar    Scalar     Vector  Vector  ComplexD      HalfSpinColourVectorD
+Scalar    Scalar     Matrix  Matrix  ComplexD      SpinColourMatrixD
+=======   =======    ======  ======  ===========  =======================
+
+The types are implemented via a recursive tensor nesting system.
+
+**Example**
+
+Here, the prefix "i" indicates internal use, preserving the template nature of the class. 
+Final types are declared with vtype selected to be both scalar and vector, appropriate to a
+single datum, or stored in a partial SoA transformed lattice object::
+
+
+    // LorentzColour
+    typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
+    typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
+    typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
+
+    typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
+    typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
+    typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
+
+
+Arbitrarily deep tensor nests may be formed. Grid uses a positional and numerical rule to associate indices for contraction
+in the Einstein summation sense.
+
+=============  =======   ========
+Symbolic name   Number   Position
+=============  =======   ========
+LorentzIndex     0        left
+SpinIndex        1        middle
+ColourIndex      2        right
+=============  =======   ========
+
+The conventions are that the index ordering left to right are: Lorentz, Spin, Colour. A scalar type (either real
+or complex, single or double precision) is provided to the innermost structure.
+
+
+Tensor arithmetic rules (lib/tensors/Tensor_arith.h)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Arithmetic rules are defined on these types
+
+The multiplication operator follows the natural multiplication
+table for each index, index level by index level.
+
+`Operator *`
+
+==  == ==  == 
+x   S  V   M  
+==  == ==  == 
+S   S  V   M
+V   S  S   V
+M   M  V   M
+==  == ==  == 
+
+The addition and subtraction rules disallow a scalar to be added to a vector,
+and vector to be added to matrix. A scalar adds to a matrix on the diagonal.
+
+*Operator* + and *Operator* -
+
+===  == ==  == 
++/-   S  V   M  
+===  == ==  == 
+S    S       M
+V       V     
+M    M       M
+===  == ==  == 
+
+The rules for nested objects are recursively inferred level by level from the basic rules of multiplication,
+addition and subtraction for scalar/vector/matrix. Legal expressions can only be formed between objects
+with the same number of nested internal indices. All the Grid QCD datatypes have precisely three internal 
+indices, some of which may be trivial scalar to enable expressions to be formed.
+
+Arithmetic operations are possible where the left or right operand is a scalar type.
+
+**Example**::
+
+    LatticeColourMatrixD U(grid);
+    LatticeColourMatrixD Udag(grid);
+
+    Udag = adj(U);
+    
+    RealD unitary_err = norm2(U*adj(U) - 1.0);
+
+Will provide a measure of how discrepant from unitarity the matrix U is.
+
+Internal index manipulation (lib/tensors/Tensor_index.h)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+General code can access any specific index by number with a peek/poke semantic::
+
+   // peek index number "Level" of a vector index
+   template<int Level,class vtype>  auto peekIndex (const vtype &arg,int i);
+
+   // peek index number "Level" of a matrix index
+   template<int Level,class vtype>  auto peekIndex (const vtype &arg,int i,int j);
+ 
+   // poke index number "Level" of a vector index
+   template<int Level,class vtype>  
+   void pokeIndex (vtype &pokeme,arg,int i) 
+
+   // poke index number "Level" of a matrix index
+   template<int Level,class vtype>  
+   void pokeIndex (vtype &pokeme,arg,int i,int j) 
+   
+**Example**::
+
+    for (int mu = 0; mu < Nd; mu++) {
+      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+    }
+
+Similar to the QDP++ package, convenience routines are provided to access specific elements of
+vector and matrix internal index types by physics name. These are aliases for the above routines
+with the appropriate index constant.
+
+* peekColour
+* peekSpin
+* peekLorentz
+
+and
+
+* pokeColour
+* pokeSpin
+* pokeLorentz
+
+For example, we often store Gauge Fields with a Lorentz index, but can split them into
+polarisations in relevant pieces of code.
+
+**Example**::
+
+    for (int mu = 0; mu < Nd; mu++) {
+      U[mu] = peekLorentz(Umu, mu);
+    }
+       
+For convenience, direct access as both an l-value and an r-value is provided by the parenthesis operator () on each of the Scalar, Vector and Matrix classes.
+For example one may write
+
+**Example**::
+
+  ColourMatrix A, B;
+
+  A()()(i,j) = B()()(j,i);
+
+bearing in mind that empty parentheses are needed to address a scalar entry in the tensor index nest. 
+
+The first (left) empty parentheses move past the (scalar) Lorentz level in the tensor nest, and the second
+(middle) empty parentheses move past the (scalar) spin level. The (i,j) index the colour matrix.
+
+Other examples are easy to form for the many cases, and should be obvious to the reader.
+This form of addressing is convenient and saves peeking, modifying and poking
+multiple temporary objects when both spin and colour indices are being accessed.
+There are many cases where multiple lines of code are required with a peek/poke semantic which are
+easier with direct l-value and r-value addressing.
+
+Matrix operations
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Transposition and tracing specific internal indices are possible using::
+
+  template<int Level,class vtype>  
+  auto traceIndex (const vtype &arg)
+
+  template<int Level,class vtype>  
+  auto transposeIndex (const vtype &arg)
+
+These may be used as
+
+**Example**::
+
+  LatticeColourMatrixD Link(grid);
+  ComplexD link_trace = traceIndex<ColourIndex> (Link);
+
+Again, convenience aliases for QCD naming schemes are provided via       
+
+* traceColour
+* traceSpin
+
+* transposeColour
+* transposeSpin
+
+**Example**::
+
+  ComplexD link_trace = traceColour (Link);
+
+These operations only make sense for matrix and scalar internal indices.
+
+The trace and transpose over all indices are also defined for matrix and scalar types::
+
+   template<class vtype,int N> 
+   auto trace(const iMatrix<vtype,N> &arg) -> iScalar
+
+   template<class vtype,int N> 
+   auto transpose(const iMatrix<vtype,N> &arg  ) -> iMatrix
+
+Similar functions are:
+
+* conjugate
+* adjoint
+
+
+The traceless anti-Hermitian part is taken with::
+
+    template<class vtype,int N> iMatrix<vtype,N> 
+    Ta(const iMatrix<vtype,N> &arg)
+
+SU(N) Reunitarisation (or reorthogonalisation) is enabled by::
+  
+    template<class vtype,int N> iMatrix<vtype,N> 
+    ProjectOnGroup(const iMatrix<vtype,N> &arg)
+
+**Example**::
+
+  LatticeColourMatrixD Mom(grid);
+  LatticeColourMatrixD TaMom(grid);
+
+  TaMom  = Ta(Mom);
+
+
+Querying internal index structure
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Templated code may find it useful to use query functions on the Grid datatypes they are provided.
+For example general Serialisation and I/O code can inspect the nature of a type a routine has been
+asked to read from disk, or even generate descriptive type strings::
+
+      ////////////////////////////////////////////////////
+      // Support type queries on template params:
+      ////////////////////////////////////////////////////
+      // int _ColourScalar  =  isScalar<ColourIndex,vobj>();
+      // int _ColourVector  =  isVector<ColourIndex,vobj>();
+      // int _ColourMatrix  =  isMatrix<ColourIndex,vobj>();
+      template<int Level,class vtype>  int isScalar(void)
+      template<int Level,class vtype>  int isVector(void)
+      template<int Level,class vtype>  int isMatrix(void)
+
+**Example** (lib/parallelIO/IldgIO.h)::
+
+  template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) { 
+
+  /////////////////////////////////////////
+  // Encode a generic tensor as a string
+  /////////////////////////////////////////
+
+  typedef typename getPrecision<vobj>::real_scalar_type stype;
+
+  int _ColourN       = indexRank<ColourIndex,vobj>();
+  int _ColourScalar  =  isScalar<ColourIndex,vobj>();
+  int _ColourVector  =  isVector<ColourIndex,vobj>();
+  int _ColourMatrix  =  isMatrix<ColourIndex,vobj>();
+
+  int _SpinN       = indexRank<SpinIndex,vobj>();
+  int _SpinScalar  =  isScalar<SpinIndex,vobj>();
+  int _SpinVector  =  isVector<SpinIndex,vobj>();
+  int _SpinMatrix  =  isMatrix<SpinIndex,vobj>();
+
+  int _LorentzN       = indexRank<LorentzIndex,vobj>();
+  int _LorentzScalar  =  isScalar<LorentzIndex,vobj>();
+  int _LorentzVector  =  isVector<LorentzIndex,vobj>();
+  int _LorentzMatrix  =  isMatrix<LorentzIndex,vobj>();
+
+  std::stringstream stream;
+
+  stream << "GRID_";
+  stream << ScidacWordMnemonic<stype>();
+
+  if ( _LorentzVector )   stream << "_LorentzVector"<<_LorentzN;
+  if ( _LorentzMatrix )   stream << "_LorentzMatrix"<<_LorentzN;
+
+  if ( _SpinVector )   stream << "_SpinVector"<<_SpinN;
+  if ( _SpinMatrix )   stream << "_SpinMatrix"<<_SpinN;
+
+  if ( _ColourVector )   stream << "_ColourVector"<<_ColourN;
+  if ( _ColourMatrix )   stream << "_ColourMatrix"<<_ColourN;
+
+  if ( _ColourScalar && _LorentzScalar && _SpinScalar )   stream << "_Complex";
+
+  typesize = sizeof(typename vobj::scalar_type);
+
+  if ( _ColourMatrix ) typesize*= _ColourN*_ColourN;
+  else                 typesize*= _ColourN;
+
+  if ( _SpinMatrix )   typesize*= _SpinN*_SpinN;
+  else                 typesize*= _SpinN;
+
+  };
+
+Inner and outer products
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We recursively define (tensors/Tensor_inner.h), ultimately returning scalar in all indices::
+
+  /////////////////////////////////////////////////////////////////////////
+  // innerProduct Scalar x Scalar -> Scalar
+  // innerProduct Vector x Vector -> Scalar
+  // innerProduct Matrix x Matrix -> Scalar
+  /////////////////////////////////////////////////////////////////////////
+  template<class l,class r>       
+  auto innerProductD (const iScalar<l>& lhs,const iScalar<r>& rhs)
+
+  template<class l,class r,int N> 
+  auto innerProductD (const iVector<l,N>& lhs,const iVector<r,N>& rhs)
+
+  template<class l,class r,int N> 
+  auto innerProductD (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs)
+
+  template<class l,class r>       
+  auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs)
+
+  template<class l,class r,int N> 
+  auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs)
+
+  template<class l,class r,int N> 
+  auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs)
+
+The sum is always performed in double precision for the innerProductD variant.
+
+We recursively define (tensors/Tensor_outer.h)::
+
+  /////////////////////////////////////////////////////////////////////////
+  // outerProduct Scalar x Scalar -> Scalar
+  //              Vector x Vector -> Matrix
+  /////////////////////////////////////////////////////////////////////////
+  template<class l,class r> 
+  auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs)
+
+  template<class l,class r,int N> 
+  auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs)
+
+
+Functions of Tensor
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following unary functions are defined, which operate element by element on a tensor 
+data structure::
+
+  sqrt();
+  rsqrt();
+  sin();
+  cos();
+  asin();
+  acos();
+  log();
+  exp();
+  abs();
+  Not();
+  toReal();
+  toComplex();
+
+Element wise functions are defined for::
+
+  div(tensor,Integer);
+  mod(tensor,Integer);
+  pow(tensor,RealD);
+
+Matrix exponentiation (as opposed to element wise exponentiation) is implemented via power series in::
+
+    Exponentiate(const Tensor &r  ,RealD alpha, Integer Nexp = DEFAULT_MAT_EXP)
+
+the exponentiation is distributive across vector indices (i.e. proceeds component by component for a LorentzColourMatrix).
+
+Determinant is similar::
+
+    iScalar Determinant(const Tensor &r )
+
+Vectorisation
+--------------
+
+Internally, Grid defines a portable abstraction of SIMD vectorisation, via the following types:
+
+* vRealF
+
+* vRealD
+
+* vComplexF
+
+* vComplexD
+
+These have the usual range of arithmetic operators and functions acting upon them. They do not form
+part of the API, but are mentioned to (partially) explain the need for controlling the
+layout transformation in lattice objects. They contain a number of consecutive elements of the appropriate
+Real/Complex type, where the number is architecture dependent. The number may be queried at runtime using::
+
+	     vComplexF::Nsimd();
+
+The layout transformations in indexing functions in the Grid objects are completely parameterised by this Nsimd().
+They are documented further in the Internals chapter.
+
+Coordinates
+------------
+
+The Grid is defined on an N-dimensional set of integer coordinates.
+
+The maximum dimension is eight, and indexes in this space make use of the Coordinate class.
+The coordinate class shares a similar interface to `std::vector<int>`, but contains all data within the
+object, and has a fixed maximum length (template parameter).
+
+**Example**::
+
+	const int Nd=4;
+	Coordinate point(Nd);
+
+	for(int i=0;i<Nd;i++) 
+	  point[i] = 1;
+
+	std::cout<< point <<std::endl;
+
+	point.resize(3);
+
+	std::cout<< point <<std::endl;
+
+This enables the coordinates to be manipulated without heap allocation or thread contention,
+and avoids introducing STL functions into GPU code, but does so at the expense of introducing
+a maximum dimensionality. This limit is easy to change (lib/util/Coordinate.h).
+
+
+Grids
+-------------
+
+A Grid object defines the geometry of a global cartesian array, and through inheritance
+provides access to message passing decomposition, the local lattice, and the message passing primitives.
+
+The constructor requires parameters to indicate how the spatial (and temporal) indices
+are decomposed across MPI tasks and SIMD lanes of the vector length.
+We use a partial vectorisation transformation, and must select
+which space-time dimensions participate in SIMD vectorisation.
+The Lattice containers are defined to have opaque internal layout, hiding this layout transformation.
+	  
+We define GridCartesian and GridRedBlackCartesian which both inherit from GridBase::
+
+    class GridCartesian        : public GridBase 
+    class GridRedBlackCartesian: public GridBase 
+
+The simplest Cartesian Grid constructor distributes across `MPI_COMM_WORLD`::
+
+    /////////////////////////////////////////////////////////////////////////
+    // Construct from comm world
+    /////////////////////////////////////////////////////////////////////////
+    GridCartesian(const Coordinate &dimensions,
+  		  const Coordinate &simd_layout,
+		  const Coordinate &processor_grid);
+
+A second constructor will create a child communicator from a previously declared Grid.
+This allows one to subdivide the processor grid, and also to define lattices of differing dimensionalities and sizes,
+useful for both Chiral fermions, lower dimensional operations, and multigrid::
+
+    /////////////////////////////////////////////////////////////////////////
+    // Constructor takes a parent grid and possibly subdivides communicator.
+    /////////////////////////////////////////////////////////////////////////
+    GridCartesian(const Coordinate &dimensions,
+	          const Coordinate &simd_layout,
+	          const Coordinate &processor_grid,
+		  const GridCartesian &parent,int &split_rank);
+
+The Grid object provides much `internal` functionality to map a lattice site to 
+a node and lexicographic index. These are not needed by code interfacing
+to the data parallel layer.
+
+When the requested processor grid is smaller than the parent's processor grid, multiple copies of the
+same geometry communicator are created, indexed by split_rank. This can be convenient to split
+a job into multiple independent sub jobs (a long present feature of MPI). It can be particularly
+effective in valence analysis, where for example, many inversions are performed on a single configuration.
+These can be made on smaller communicators in parallel and communications overheads minimised. Routines::
+
+      Grid_split
+      Grid_unsplit
+
+are provided to communicate fields between different communicators (e.g. between inversion and contraction phases).
+
+**Example** (tests/solver/Test_split_grid.cc)::
+
+  const int Ls=8;
+
+  ////////////////////////////////////////////
+  // Grids distributed across full machine
+  // pick up default command line args
+  ////////////////////////////////////////////
+  Grid_init(&argc,&argv);
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+                                                                   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  /////////////////////////////////////////////
+  // Split into N copies of 1^4 mpi communicators
+  /////////////////////////////////////////////
+  Coordinate mpi_split (mpi_layout.size(),1);
+  GridCartesian         * SGrid = new GridCartesian(GridDefaultLatt(),
+						    GridDefaultSimd(Nd,vComplex::Nsimd()),
+						    mpi_split,
+						    *UGrid); 
+
+  GridCartesian         * SFGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
+  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
+  GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid);
+
+  ///////////////////////////////////////////////////////////////
+  // split the source out using MPI instead of I/O
+  ///////////////////////////////////////////////////////////////
+  Grid_split  (Umu,s_Umu);
+  Grid_split  (src,s_src);
+
+**Internals**
+
+The processor Grid is defined by data values in the Communicator object::
+
+  int              _Nprocessors;     // How many in all
+  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
+  int              _processor;       // linear processor rank
+  std::vector<int> _processor_coor;  // linear processor coordinate
+  unsigned long    _ndimension;
+  Grid_MPI_Comm    communicator;
+
+The last of these is potentially an MPI Cartesian communicator, mapping some total number of processors
+to an N-dimensional coordinate system. This is used by Grid to geometrically decompose the subvolumes of a
+lattice field across processing elements. Grid is aware of multiple ranks per node and attempts to ensure
+that the geometrical decomposition keeps as many neighbours as possible on the same node. This is done
+by reordering the ranks in the constructor of a Communicator object once the topology requested has
+been indicated, via an internal call to the method OptimalCommunicator(). The reordering is chosen
+by Grid to trick MPI, which makes a simple lexicographic assignment of ranks to coordinate, to ensure
+that the simple lexicographic assignment of the reordered ranks is the optimal choice. MPI does not do this
+by default and substantial improvements arise from this design choice.
+
+Lattice containers
+-----------------------------------------
+
+Lattice objects may be constructed to contain the local portion of a distributed array of any tensor type.
+For performance reasons the tensor type uses a vector Real or Complex as the fundamental datum.
+
+Every lattice requires a GridBase object pointer to be provided in its constructor. Memory is allocated
+at construction time. If a Lattice is passed a RedBlack grid, it allocates
+half the storage of the full grid, and may either store the red or black checkerboard. The Lattice object
+will automatically track through assignments which checkerboard it refers to.
+For example, shifting an Even checkerboard by an odd distance produces an Odd result field.
+
+Struct of array objects are defined, and used in the template parameters to the lattice class.
+
+**Example** (lib/qcd/QCD.h)::
+
+       template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
+       typedef iSpinMatrix<ComplexF>           SpinMatrixF; //scalar
+       typedef iSpinMatrix<vComplexF>          vSpinMatrixF;//vectorised
+       typedef Lattice<vSpinMatrixF>           LatticeSpinMatrixF;
+
+The full range of QCD relevant lattice objects is given below.
+
+=======  =======    ======  ======  ===========  =============================   =====================
+Lattice  Lorentz    Spin    Colour  scalar_type   Field                            Synonym
+=======  =======    ======  ======  ===========  =============================   =====================
+Vector   Scalar     Scalar  Scalar  Integer       LatticeInteger                 N/A
+Vector   Scalar     Scalar  Scalar  RealD         LatticeRealD                   N/A
+Vector   Scalar     Scalar  Scalar  ComplexD      LatticeComplexD                N/A
+Vector   Scalar     Scalar  Matrix  ComplexD      LatticeColourMatrixD           LatticeGaugeLink
+Vector   Vector     Scalar  Matrix  ComplexD      LatticeLorentzColourMatrixD    LatticeGaugeFieldD
+Vector   Scalar     Vector  Vector  ComplexD      LatticeSpinColourVectorD       LatticeFermionD
+Vector   Scalar     Vector  Vector  ComplexD      LatticeHalfSpinColourVectorD   LatticeHalfFermionD
+Vector   Scalar     Matrix  Matrix  ComplexD      LatticeSpinColourMatrixD       LatticePropagatorD
+=======  =======    ======  ======  ===========  =============================   =====================
+
+Additional single precision variants are defined with the suffix "F".
+Other lattice objects can be defined using the sort of typedef's shown above if needed.
+LatticeInteger is typically only used in the form of predicate fields for where statements.
+
+Opaque containers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The layout within the container is complicated to enable maximum opportunity for vectorisation, and 
+is opaque from the point of view of the API definition. The key implementation observation is that
+so long as data parallel operations are performed and adjacent SIMD lanes correspond to well separated
+lattice sites, then identical operations are performed on all SIMD lanes and enable good vectorisation.
+
+Because the layout is opaque, import and export routines from naturally ordered x,y,z,t arrays
+are provided (lib/lattice/Lattice_transfer.h)::
+
+    unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in);
+    vectorizeFromLexOrdArray(std::vector<sobj> &in , Lattice<vobj> &out);
+
+sobj and vobj should be a matching pair of scalar and vector objects of the same internal structure.
+The compiler will let you know with a long and verbose complaint if they are not.
+	  
+The Lexicographic order of data in the external vector fields is defined by (lib/util/Lexicographic.h)::
+
+    Lexicographic::IndexFromCoor(const Coordinate &lcoor, int &lex,Coordinate *local_dims);
+
+This ordering is :math:`x + L_x * y + L_x*L_y*z + L_x*L_y*L_z *t`
+
+Peek and poke routines are provided to perform single site operations. These operations are
+extremely low performance and are not intended for algorithm development or performance-critical code.
+
+The following are `collective` operations and involve communication between nodes. All nodes receive the same
+result by broadcast from the owning node::
+
+    void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site);
+    void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site);
+
+The following are executed independently by each node::
+
+    void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site);
+    void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site);
+
+Lattices of one tensor type may be transformed into lattices of another tensor type by
+peeking and poking specific indices in a data parallel manner::
+
+    template<int Index,class vobj> // Vector data parallel index peek
+    auto PeekIndex(const Lattice<vobj> &lhs,int i);
+
+    template<int Index,class vobj> // Matrix data parallel index peek
+    auto PeekIndex(const Lattice<vobj> &lhs,int i,int j);
+
+    template<int Index,class vobj>   // Vector poke
+    void PokeIndex(Lattice<vobj> &lhs,const Lattice<> & rhs,int i)
+
+    template<int Index,class vobj>   // Matrix poke
+    void PokeIndex(Lattice<vobj> &lhs,const Lattice<> & rhs,int i,int j)
+  
+The inconsistent capitalisation on the letter P is due to an obscure bug in g++ that has not to
+our knowledge been fixed in any version. The bug was reported in 2016.
+
+.. todo:: CD: Do you want to mention/expose PropToFerm and FermToProp?
+	  Are there other such convenience routines to make part of the API?
+
+
+Global Reduction operations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Reduction operations for any lattice field are provided. The result is identical on each computing node
+that is part of the relevant Grid communicator::
+
+  template<class vobj> 
+  RealD norm2(const Lattice<vobj> &arg);
+
+  template<class vobj> 
+  ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right);
+
+  template<class vobj> 
+  vobj sum(const Lattice<vobj> &arg)
+
+Site local reduction operations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Internal indices may be reduced, site by site, using the following routines::
+
+  template<class vobj>
+  auto localNorm2 (const Lattice<vobj> &rhs)
+
+  template<class vobj>
+  auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) 
+
+Outer product
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A site local outer product is defined::
+
+  template<class ll,class rr>
+  auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs)
+
+
+Slice operations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Slice operations are defined to operate on one lower dimension than the full lattice. The omitted dimension
+is the parameter orthogdim::
+
+  template<class vobj> 
+  void sliceSum(const Lattice<vobj> &Data,
+                std::vector<typename vobj::scalar_object> &result,
+ 	        int orthogdim);
+
+  template<class vobj> 
+  void sliceInnerProductVector( std::vector<ComplexD> & result, 
+	                        const Lattice<vobj> &lhs,
+	 			const Lattice<vobj> &rhs,
+				int orthogdim); 
+
+  template<class vobj> 
+  void sliceNorm (std::vector<RealD> &sn,
+       		  const Lattice<vobj> &rhs,
+		  int orthogdim);
+
+Data parallel expression template engine
+------------------------------------------
+
+The standard arithmetic operators and some data parallel library functions are implemented site by site
+on lattice types. 
+
+Operations may only ever combine lattice objects that have been constructed from the **same** grid pointer.
+
+**Example**::
+
+    LatticeFermionD A(&grid);
+    LatticeFermionD B(&grid);
+    LatticeFermionD C(&grid);
+    
+    A = B - C;
+
+Such operations are said to be **conformable** and the lattices are guaranteed to have the same dimensions
+and both MPI and SIMD decomposition because they are based on the same grid object. The conformability check
+is lightweight and simply requires the same grid pointers be passed to the lattice objects. The data members
+of the grid objects are not compared.
+
+Conformable lattice fields may be combined with appropriate scalar types in expressions. The implemented
+rules follow those already documented for the tensor types. 
+
+
+
+
+Unary operators and functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following sitewise unary operations are defined:
+
+=====================    ============================================
+Operation                 Description
+=====================    ============================================
+operator-                negate
+adj                      Hermitian conjugate
+conjugate                complex conjugate
+trace                    sitewise trace
+transpose                sitewise transpose
+Ta                       take traceless anti Hermitian part
+ProjectOnGroup           reunitarise or orthogonalise
+real                     take the real part
+imag                     take the imaginary part
+toReal                   demote complex to real
+toComplex                promote real to complex
+timesI                   elementwise +i mult (0 is not multiplied)
+timesMinusI              elementwise -i mult (0 is not multiplied)
+abs                      elementwise absolute value
+sqrt                        elementwise square root
+rsqrt                       elementwise reciprocal square root
+sin                         elementwise sine
+cos                         elementwise cosine
+asin                        elementwise inverse sine
+acos                        elementwise inverse cosine
+log                         elementwise logarithm
+exp                         elementwise exponentiation
+operator!                   Logical negation of integer field
+Not                         Logical negation of integer field
+=====================    ============================================
+
+
+
+The following sitewise applied functions with additional parameters are::
+
+  template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y);
+
+  template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y);
+
+  template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y);
+
+  template<class obj> Lattice<obj> 
+  expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP);
+
+Binary operators
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following binary operators are defined::
+
+  operator+
+  operator-
+  operator*
+  operator/
+
+Logical operators are defined on LatticeInteger types::
+
+  operator&
+  operator|
+  operator&&
+  operator||
+
+
+Ternary operator, logical operations and where
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Within the data parallel level of the API the only way to perform operations
+that are differentiated between sites is to use predicated execution.
+
+The predicate takes the form of a LatticeInteger which is conformable with both
+the iftrue and iffalse arguments::
+
+  template<class vobj,class iobj> void where(const Lattice<iobj> &pred,
+                                                   Lattice<vobj> &iftrue,
+						   Lattice<vobj> &iffalse);
+
+This plays the data parallel analogue of the C++ ternary operator::
+
+     a == b ? c : d;
+
+In order to create the predicate in a coordinate dependent fashion it is often useful
+to use the lattice coordinates. 
+
+The LatticeCoordinate function::
+
+    template<class iobj> LatticeCoordinate(Lattice<iobj> &coor,int dir);
+
+fills an Integer field with the coordinate in the direction specified by "dir".
+A usage example is given below.
+
+**Example**::
+
+        int dir = 3;
+        int block = 4;
+        LatticeInteger coor(FineGrid);
+
+	LatticeCoordinate(coor,dir);
+	
+	result = where(mod(coor,block)==(block-1),x,z);
+
+This example takes result to be either "x" or "z" in a coordinate dependent way:
+"x" is selected when the lattice coordinate in direction "dir" lies at the upper boundary of a block
+(periodic arithmetic), and "z" otherwise.
+This example is lifted and paraphrased from code that (data parallel) evaluates matrix elements
+for a coarse representation of the Dirac operator in multigrid.
+	  
+Other usage cases of LatticeCoordinate include the generation of plane wave momentum phases.
+
+Site local fused operations
+------------------------------------------
+
+The biggest limitation of expression template engines is that the optimisation 
+visibility is a single assignment statement in the original source code.
+
+There is no scope for loop fusion between multiple statements.
+Multi-loop fusion gives scope for greater cache locality.
+
+Two primitives for hardware aware parallel loops are provided.
+These will operate directly on the site objects which are expanded by a factor
+of the vector length (in our struct of array datatypes). 
+
+Since the mapping of sites
+to data lanes is opaque, these vectorised loops
+are *only* appropriate for optimisation of site local operations.
+
+
+View objects
+^^^^^^^^^^^^^^
+
+Due to an obscure aspect of the way that Nvidia handle device C++11 lambda functions,
+it is necessary to disable the indexing of a Lattice object.
+
+Rather, a reference to a lattice object must be first obtained.
+
+The reference is copyable to a GPU, and is able to be indexed on either accelerator code,
+or by host code.
+
+In order to prevent people developing code that dereferences Lattice objects in a way that
+works on CPU compilation, but fails on GPU compilation, we have decided to remove the ability
+to index a lattice object on CPU code. 
+
+As a result of Nvidia's constraints, all accesses to lattice objects are required to be made
+through a View object. 
+
+In the following, the type is LatticeView<vobj>, however it is wise to use the C++11 auto keyword
+to avoid naming the type. See code examples below.
+
+
+thread_loops
+^^^^^^^^^^^^^^
+
+The first parallel primitive is the thread_loop
+
+**Example**::
+
+  LatticeField r(grid); 
+  LatticeField x(grid);
+  LatticeField p(grid); 
+  LatticeField mmp(grid);
+  auto r_v = r.View();  
+  auto x_v = x.View();
+  auto p_v = p.View(); 
+  auto mmp_v = mmp.View();
+  thread_loop(s , r_v, {
+    r_v[s] = r_v[s]   - a * mmp_v[s];
+    x_v[s] = x_v[s]   + a*p_v[s];
+    p_v[s] = p_v[s]*b + r_v[s];
+  });
+
+accelerator_loops
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The second parallel primitive is the "accelerator_loop".
+
+The thread loop runs on host processor cores only. If the enabled architecture is
+VGPU, i.e. if Grid is configured with
+
+	 --enable-simd=VGPU, 
+
+the accelerator_loop may run on a GPU if present. On non-accelerated architectures,
+the accelerator_loop will simply run as an OpenMP thread_loop.
+
+It is planned to support multiple forms of accelerator in future, including OpenMP 5.0 offload,
+and possibly SYCL based offload.
+
+
+**Example**::
+
+  LatticeField r(grid); 
+  LatticeField x(grid);
+  LatticeField p(grid); 
+  LatticeField mmp(grid);
+  auto r_v = r.View();  
+  auto x_v = x.View();
+  auto p_v = p.View(); 
+  auto mmp_v = mmp.View();
+  accelerator_loop(s , r_v, {
+    r_v[s] = r_v[s]   - a * mmp_v[s];
+    x_v[s] = x_v[s]   + a*p_v[s];
+    p_v[s] = p_v[s]*b + r_v[s];
+  });
+
+
+
+Cshift 
+^^^^^^^^^^
+
+Site shifting operations are provided using the Cshift function::
+
+  template<class vobj> 
+  Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
+
+This shifts the whole vector by any distance shift in the appropriate dimension.
+
+For the avoidance of doubt on direction conventions, a positive shift moves the
+lattice site :math:`x_\mu = 1` in the rhs to :math:`x_\mu = 0` in the result.
+
+**Example** (benchmarks/Benchmark_wilson.cc)::
+
+  { // Naive wilson implementation
+    ref = Zero();
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = U[mu]*Cshift(src,mu,1);
+      {
+	auto ref_v = ref.View();
+	auto tmp_v = tmp.View();
+	for(int i=0;i<ref_v.size();i++){
+	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
+	}
+      }
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu,-1);
+      {
+	auto ref_v = ref.View();
+	auto tmp_v = tmp.View();
+	for(int i=0;i<ref_v.size();i++){
+	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
+	}
+      }
+    }
+  }
+
+
+
+
+Inter-grid transfer operations
+-----------------------------------------------------
+
+Transferring between different checkerboards of the same global lattice::
+
+  template<class vobj> 
+  void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full);
+
+  template<class vobj> 
+  void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half);
+
+These are used to set up Schur red-black decomposed solvers, for example.
+
+Multi-grid projection between a fine and coarse grid::
+
+ template<class vobj,class CComplex,int nbasis>
+ void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
+                   const             Lattice<vobj>   &fineData,
+                   const std::vector<Lattice<vobj> > &Basis);
+
+Multi-grid promotion to a finer grid::
+
+  template<class vobj,class CComplex,int nbasis>
+  void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
+                    Lattice<vobj>   &fineData,
+                    const std::vector<Lattice<vobj> > &Basis)
+
+
+Support for subblock Linear algebra::
+
+  template<class vobj,class CComplex>
+  void blockZAXPY(Lattice<vobj> &fineZ,
+                  const Lattice<CComplex> &coarseA,
+                  const Lattice<vobj> &fineX,
+                  const Lattice<vobj> &fineY)
+
+  template<class vobj,class CComplex>
+  void blockInnerProduct(Lattice<CComplex> &CoarseInner,
+                         const Lattice<vobj> &fineX,
+                         const Lattice<vobj> &fineY)
+
+  template<class vobj,class CComplex>
+  void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
+
+  template<class vobj>
+  void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
+
+  template<class vobj,class CComplex>
+  void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
+
+Conversion between different SIMD layouts::
+
+  template<class vobj,class vvobj>
+  void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
+
+Slices between a grid of dimension N and a grid of dimension N+1::
+
+  template<class vobj>
+  void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
+
+  template<class vobj>
+  void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice, int orthog)
+
+Growing a lattice by a multiple factor, with periodic replication::
+
+  template<class vobj>
+  void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
+
+The latter is useful to, for example, pre-thermalise a smaller volume and then grow the volume in HMC.
+It was written while debugging G-parity boundary conditions.
+
+
+Random number generators
+=========================================
+
+Grid provides three configure time options for the random number generator engine.
+
+* sitmo
+* ranlux48
+* mt19937
+
+The selection is controlled by the --enable-rng=<option> flag.
+
+Sitmo is the default Grid RNG and is recommended. It is a hash based RNG that is cryptographically secure and has 
+
+#. passed the BigCrush tests
+
+#. can Skip forward an arbitrary distance (up to 2^256) in O(1) time
+
+We use Skip functionality to place each site in an independent well separated stream.
+The Skip was trivially parallelised, important in a many core node,
+and gives very low overhead parallel RNG initialisation.
+
+Our implementation of parallel RNG
+
+* Has passed the BigCrush tests **drawing once from each site RNG** in a round robin fashion.
+
+This test is applied in tests/testu01/Test_smallcrush.cc
+
+The interface is as follows::
+
+  class GridSerialRNG { 
+    GridSerialRNG();
+    void SeedFixedIntegers(const std::vector<int> &seeds);
+    void SeedUniqueString(const std::string &s);
+  }
+
+  class GridParallelRNG {
+    GridParallelRNG(GridBase *grid);
+    void SeedFixedIntegers(const std::vector<int> &seeds);
+    void SeedUniqueString(const std::string &s);
+  }
+
+* Seeding 
+
+The SeedUniqueString uses a 256bit SHA from the OpenSSL library to construct integer seeds.
+The reason for this is to enable reproducible seeding in the measurement sector of physics codes.
+For example, labelling a random draw by a string representation of the physics information, and then
+appending the trajectory number, will give a unique set of seeds for each measurement on each trajectory.
+This string based functionality is probably not expected to be used in a lattice evolution, except for
+possibly the initial state. Subsequent evolution should checkpoint and restore lattice RNG state using
+the interfaces below.
+
+Random numbers may be drawn as follows::
+
+  void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  }
+  void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
+
+  void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); }
+  void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
+
+* Serial RNG's are used to assign scalar fields. 
+
+* Parallel RNG's are used to assign lattice fields and must subdivide the field grid (need not be conformable).
+
+It is the API user's responsibility to initialise, manage, save and restore these RNG states for their algorithm.
+In particular there is no single globally managed RNG state. 
+
+Input/Output routines are provided for saving and restoring RNG states.
+
+lib/parallelIO/BinaryIO.h::
+
+  ////////////////////////////////////////////////////////////////////////////
+  // Read a RNG;  use IOobject and lexico map to an array of state 
+  ////////////////////////////////////////////////////////////////////////////
+  static void readRNG(GridSerialRNG &serial,
+			     GridParallelRNG &parallel,
+			     std::string file,
+			     Integer offset,
+			     uint32_t &nersc_csum,
+			     uint32_t &scidac_csuma,
+			     uint32_t &scidac_csumb)
+  ////////////////////////////////////////////////////////////////////////////
+  // Write a RNG; lexico map to an array of state and use IOobject
+  ////////////////////////////////////////////////////////////////////////////
+  static void writeRNG(GridSerialRNG &serial,
+			      GridParallelRNG &parallel,
+			      std::string file,
+			      Integer offset,
+			      uint32_t &nersc_csum,
+			      uint32_t &scidac_csuma,
+			      uint32_t &scidac_csumb)
+
+lib/parallelIO/NerscIO.h::
+
+  void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file);
+
+  void readRNG(GridSerialRNG &serial,
+	       GridParallelRNG &parallel,
+	       std::string file,
+   	       Integer offset,
+ 	       uint32_t &nersc_csum,
+	       uint32_t &scidac_csuma,
+	       uint32_t &scidac_csumb);
+
+**Example**::
+
+  NerscIO::writeRNGState(sRNG,pRNG,rfile);
+
+Input output facilities
+=========================================
+
+Grid introduces both high performance parallel I/O routines, making use of MPI-2 parallel I/O 
+internally, and also features for automatic serialisation of rich object types to various
+common formats. The SciDAC file formats are supported.
+
+Serialisation
+--------------
+
+Serialisable classes can be defined by
+
+* Deriving from Serializable
+
+* Declaring all class data members in the GRID_SERIALIZABLE_CLASS_MEMBERS macro
+
+An example is
+
+**Example**::
+
+  class myclass : public Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(
+       myclass,
+       myenum, e,
+       std::vector<myenum>, ve,
+       std::string, name,
+       int, x,
+       double, y,
+       bool , b,
+       std::vector<double>, array,
+       std::vector<std::vector<double> >, twodimarray,
+       std::vector<std::vector<std::vector<Complex> > >, cmplx3darray);
+   };
+
+
+
+We make use of variadic macros to automatically generate both the class declaration and appropriate
+I/O routines. A virtual Reader and Writer interface is used. Specific cases included in Grid are
+
+
+============   ============    ========
+Writer         Reader          Format
+============   ============    ========
+XmlWriter      XmlReader        XML
+BinaryWriter   BinaryReader     Binary
+TextWriter     TextReader       ASCII
+JSONWriter     JSONReader       json
+Hdf5Writer     Hdf5Reader       HDF5
+============   ============    ========
+
+
+Write interfaces, similar to the XML facilities in QDP++, are presented. However,
+the serialisation routines are automatically generated by the macro, and a virtual
+reader and writer interface enables writing to any of a number of formats.
+
+**Example**::
+
+  class myclass : public Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(
+      myclass,
+      myenum, e,
+      std::vector<myenum>, ve,
+      std::string, name,
+      int, x,
+      double, y,
+      bool , b,
+      std::vector<double>, array,
+      std::vector<std::vector<double> >, twodimarray,
+      std::vector<std::vector<std::vector<Complex> > >, cmplx3darray);
+   };
+
+   myclass Instance;
+   { 
+      XmlWriter WR("bother.xml");
+      push(WR,"NestedDocumentExample");
+      write(WR,Instance);
+      pop(WR);
+   }
+
+
+Data parallel field IO
+-----------------------
+
+Support is provided to perform I/O operations for distributed 
+Lattice fields. 
+
+Binary, NERSC, ILDG, and SciDAC formats are supported.
+
+The ILDG and SciDAC formats require that Grid be compiled with the LIME library.
+
+Parallel I/O makes use of MPI-2 collective parallel I/O interfaces, and 
+relies on this to deliver good performance. 
+
+Binary Interface
+^^^^^^^^^^^^^^^^^^
+
+The Binary I/O interface defines the following API functions::
+
+  template<class vobj,class fobj,class munger>
+  static void BinaryIO::readLatticeObject(Lattice<vobj> &Umu,
+				       std::string file,
+				       munger munge,
+				       Integer offset,
+				       const std::string &format,
+				       uint32_t &nersc_csum,
+				       uint32_t &scidac_csuma,
+				       uint32_t &scidac_csumb)
+
+  template<class vobj,class fobj,class munger>
+  static void BinaryIO::writeLatticeObject(Lattice<vobj> &Umu,
+					std::string file,
+					munger munge,
+					Integer offset,
+					const std::string &format,
+					uint32_t &nersc_csum,
+					uint32_t &scidac_csuma,
+					uint32_t &scidac_csumb)
+
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Read a RNG;  use IOobject and lexico map to an array of state 
+  //////////////////////////////////////////////////////////////////////////////////////
+  static void BinaryIO::readRNG(GridSerialRNG &serial,
+			     GridParallelRNG &parallel,
+			     std::string file,
+			     Integer offset,
+			     uint32_t &nersc_csum,
+			     uint32_t &scidac_csuma,
+			     uint32_t &scidac_csumb)
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Write a RNG; lexico map to an array of state and use IOobject
+  //////////////////////////////////////////////////////////////////////////////////////
+  static void BinaryIO::writeRNG(GridSerialRNG &serial,
+			      GridParallelRNG &parallel,
+			      std::string file,
+			      Integer offset,
+			      uint32_t &nersc_csum,
+			      uint32_t &scidac_csuma,
+			      uint32_t &scidac_csumb)
+
+
+The `offset` parameters allow the other file formats, with merged `headers` of various formats
+to build upon the binary I/O facilities. 
+
+Thus the bulk of the data transferred is through a common
+code across all formats, and only the header generation differs. The Binary format has no headers,
+and it is the responsibility of the programmer to retain and check checksums.
+
+WE DO NOT RECOMMEND EVER USING A BULK DATA FILE THAT HAS NOT HAD A CHECKSUM VERIFIED.
+
+Field meta data 
+^^^^^^^^^^^^^^^^^^^^
+
+In order to maximise code reuse, Grid uses an internal meta data field
+to represent gauge configuration in the API. This combines elements
+of the ILDG defined metadata and the NERSC header metadata::
+
+  class FieldMetaData : Serializable {
+  public:
+
+  GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
+				  int, nd,
+				  std::vector<int>, dimension,
+				  std::vector<std::string>, boundary,
+				  int, data_start,
+				  std::string, hdr_version,
+				  std::string, storage_format,
+				  double, link_trace,
+				  double, plaquette,
+				  uint32_t, checksum,
+				  uint32_t, scidac_checksuma,
+				  uint32_t, scidac_checksumb,
+				  unsigned int, sequence_number,
+				  std::string, data_type,
+				  std::string, ensemble_id,
+				  std::string, ensemble_label,
+				  std::string, ildg_lfn,
+				  std::string, creator,
+				  std::string, creator_hardware,
+				  std::string, creation_date,
+				  std::string, archive_date,
+				  std::string, floating_point);
+  };
+
+A routine::
+
+  void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
+
+is provided to automatically fill these data structures with the dimensions,
+the user's name, machine name, dates and data types. The checksums are computed
+and returned by the I/O routines themselves.
+
+For gauge configurations::
+
+  void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header);
+
+This will compute the physical (plaquette, link trace) attributes stored.
+
+NERSC format and generalisations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The class NerscIO provides public static methods for gauge and
+random number generator I/O as follows::
+
+  template<class vsimd>
+  static void NerscIO::readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+	 			FieldMetaData& header,
+				std::string file);
+
+  template<class vsimd>
+  static void NerscIO::writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
+			 		  std::string file, 
+					  int two_row,
+					  int bits32);
+
+  static void NerscIO::writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file);
+
+  static void NerscIO::readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,
+                                    FieldMetaData& header,std::string file);
+
+**Implementation detail**
+
+  The lattice field routines internally use the above Binary routines to write the bulk data at an offset
+  using MPI-2 I/O. 
+
+
+SciDAC and ILDG formats
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The routines in this section rely on the c-lime library being enabled in the Grid compilation
+(http://usqcd-software.github.io/c-lime/). The configure script searches for `lime`,
+but the location can be indicated with the 
+
+  `-\\-with-lime=prefix` 
+
+flag. 
+
+General readers and writers for Lime record files are presented::
+
+  class GridLimeReader : public BinaryIO { 
+    void open(const std::string &_filename);
+    void close(void);
+    template<class vobj>
+    void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name);
+    template<class serialisable_object>
+    void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
+  };
+
+  class GridLimeWriter : public BinaryIO {
+    void open(const std::string &_filename);
+    void close(void);
+    int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize);
+    template<class vobj>
+    void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name);
+    template<class serialisable_object>
+    void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name);
+  };
+
+
+These are specialised to SciDAC writers, introducing facilities for generating type strings and checksum information::
+
+  template<class vobj> std::string 
+  ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount);
+
+  template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
+					 FieldMetaData &header,
+					 scidacRecord & _scidacRecord,
+					 scidacFile   & _scidacFile);
+
+
+  class ScidacWriter : public GridLimeWriter {
+    template<class SerialisableUserFile>
+    void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile);
+    template <class vobj, class userRecord>
+    void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord);
+  };
+
+  class ScidacReader : public GridLimeReader {
+    template<class SerialisableUserFile>
+    void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile);
+    template <class vobj, class userRecord>
+    void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord);
+  };
+
+They are also specialised to ILDG format writers, available and defined only for Gauge configurations::
+
+  class IldgWriter : public ScidacWriter {
+
+    template <class vsimd>
+    void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description);
+
+  };
+
+  class IldgReader : public GridLimeReader {
+    template <class vsimd>
+    void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) ;
+
+  };
+
+**Implementation detail**
+
+  The lattice field routines internally use the above Binary routines to write the bulk data at an offset
+  using MPI-2 I/O. The cooperation with c-lime is functional but inelegant, using `ftell` on the File
+  pointer that we provide to c-lime to find the current offset when we write the payload.
+
+**Example** (tests/IO/Test_ildg_io.cc)::
+  
+  GridCartesian     Fine(latt_size,simd_layout,mpi_layout);
+  GridParallelRNG   pRNG(&Fine); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  LatticeGaugeField Umu(&Fine);
+  SU<Nc>::HotConfiguration(pRNG,Umu);
+  FieldMetaData header;
+
+  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
+  std::cout <<GridLogMessage<<"** Writing out  ILDG conf    *********"<<std::endl;
+  std::cout <<GridLogMessage<<"**************************************"<<std::endl;
+  std::string file("./ckpoint_ildg.4000");
+  IldgWriter _IldgWriter;
+  _IldgWriter.open(file);
+  _IldgWriter.writeConfiguration(Umu,4000,std::string("dummy_ildg_LFN"),std::string("dummy_config"));
+  _IldgWriter.close();
+
+**Example** (tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc)::
+
+  void checkpointFine(std::string evecs_file,std::string evals_file)
+  {
+    assert(this->_Aggregate.subspace.size()==nbasis);
+    emptyUserRecord record;
+    Grid::ScidacWriter WR;
+    WR.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      WR.writeScidacFieldRecord(this->_Aggregate.subspace[k],record);
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_fine);
+  }
+
+Linear operators
+=========================================
+
+As a basic interface to fermion Dirac operators, we define an abstract sparse matrix base class::
+
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // Interface defining what I expect of a general sparse matrix, such as a Fermion action
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class SparseMatrixBase {
+  public:
+    virtual GridBase *Grid(void) =0;
+    // Full checkerboard operations
+    virtual RealD M    (const Field &in, Field &out)=0;
+    virtual RealD Mdag (const Field &in, Field &out)=0;
+    virtual void  MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
+      Field tmp (in.Grid());
+      ni=M(in,tmp);
+      no=Mdag(tmp,out);
+    }
+    virtual  void Mdiag    (const Field &in, Field &out)=0;
+    virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
+  };
+
+And a derived class adding methods suitable to red black preconditioning::
+
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // Interface augmented by a red black sparse matrix, such as a Fermion action
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
+  public:
+    virtual GridBase *RedBlackGrid(void)=0;
+    // half checkerboard operations
+    virtual  void Meooe    (const Field &in, Field &out)=0;
+    virtual  void Mooee    (const Field &in, Field &out)=0;
+    virtual  void MooeeInv (const Field &in, Field &out)=0;
+  
+    virtual  void MeooeDag    (const Field &in, Field &out)=0;
+    virtual  void MooeeDag    (const Field &in, Field &out)=0;
+    virtual  void MooeeInvDag (const Field &in, Field &out)=0;
+  };
+
+A checkerboard is defined by the parity :math:`(x+y+z+t) \bmod 2`, and half checkerboard operations are supported
+for red black preconditioning.
+
+
+=============       ====================================================================
+Member                  Description
+=============       ====================================================================
+M                    Apply matrix
+Mdag                 Apply matrix adjoint
+MdagM                Apply matrix then adjoint matrix
+Mdiag                Apply site diagonal part of matrix
+Mdir                 Apply terms involving hopping one direction
+Meooe                Apply even/odd matrix. ResultCB determined by InputCB
+Mooee                Apply site diagonal terms to a CB field
+MooeeInv             Apply inverse of site diagonal terms to a CB field
+MeooeDag             Apply adjoint of Meooe
+MooeeDag             Apply adjoint of Mooee
+MooeeInvDag          Apply adjoint inverse of Mooee
+=============       ====================================================================
+
+All Fermion operators will derive from this base class.
+
+
+Linear Operators
+-------------------
+
+We introduce a second, decoupled class of LinearOperators. Linear operators may
+compose the elements of the sparse matrix in different ways.
+
+By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
+between RB and non-RB variants. Sparse matrix is like the fermion action def, and then
+the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
+replication of code.
+
+
+**Abstract base**::
+
+  template<class Field> class LinearOperatorBase {
+  public:
+
+    // Support for coarsening to a multigrid
+    virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
+    virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
+
+    virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
+    virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
+    virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
+    virtual void HermOp(const Field &in, Field &out)=0;
+  };
+
+==============           ==========================================================================
+Member                       Description
+==============           ==========================================================================
+OpDiag                    Diagonal term
+OpDir                     Terms involving hopping in one direction
+Op                        Full operator
+AdjOp                     Full operator adjoint
+HermOp                    A hermitian version of operator (possibly squared)
+HermOpAndNorm             A hermitian version of operator (possibly squared) returns norm result
+==============           ==========================================================================
+
+
+MdagMLinearOperator
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This Linear operator takes a SparseMatrix (Fermion operator) and implements the unpreconditioned
+MdagM operator with the above interface::
+
+  template<class Matrix,class Field>
+  class MdagMLinearOperator : public LinearOperatorBase<Field> {
+  public:
+    MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
+  ....
+  }
+
+ShiftedMdagMLinearOperator
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This Linear operator takes a SparseMatrix (Fermion operator) and implements the unpreconditioned
+`MdagM + shift` operator with the above interface::
+
+  template<class Matrix,class Field>
+  class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
+    ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
+  };
+
+HermitianLinearOperator
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This Linear operator takes an already Hermitian SparseMatrix (Fermion operator) and implements the 
+`M` operator with the above interface::
+
+  template<class Matrix,class Field>
+  class HermitianLinearOperator : public LinearOperatorBase<Field> {
+    HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
+    void Op     (const Field &in, Field &out)   { _Mat.M(in,out);  }
+    void AdjOp     (const Field &in, Field &out){ _Mat.M(in,out);  }
+  }
+
+This operator is suitable for staggered Fermion inversions for example, or for :math:`\gamma_5 D_W` inversions.
+
+Red Black
+-----------
+
+We introduce a base operator for Schur decomposed red black solves::
+
+  template<class Field>
+  class SchurOperatorBase :  public LinearOperatorBase<Field> {
+  public:
+    virtual  RealD Mpc      (const Field &in, Field &out) =0;
+    virtual  RealD MpcDag   (const Field &in, Field &out) =0;
+    virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no);
+    virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2);
+    virtual void HermOp(const Field &in, Field &out);
+    void Op     (const Field &in, Field &out);
+    void AdjOp     (const Field &in, Field &out);
+    void OpDiag (const Field &in, Field &out) ;
+    void OpDir  (const Field &in, Field &out,int dir,int disp) ;
+  };
+
+Since there are a number of ways to perform Schur decomposition and solution there
+are a number of derived classes::
+
+  template<class Matrix,class Field>
+  class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> {
+    SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
+  };
+
+  template<class Matrix,class Field>
+  class SchurDiagOneLH :  public SchurOperatorBase<Field> {
+    SchurDiagOneLH (Matrix &Mat): _Mat(Mat){};
+  };
+
+  template<class Matrix,class Field>
+  class SchurDiagOneRH :  public SchurOperatorBase<Field> {
+  SchurDiagOneRH (Matrix &Mat): _Mat(Mat){};
+  };
+
+For staggered fermions the Schur decomposition operator is::
+
+  template<class Matrix,class Field>
+  class SchurStaggeredOperator :  public SchurOperatorBase<Field> {
+  }
+
+The meaning of these different operators is
+
+=======================       ======================================================================================
+Operator                           Description
+=======================       ======================================================================================
+SchurDiagMooeeOperator         :math:`M_{oo} + M_{oe} M_{ee}^{-1} M_{eo}`
+SchurDiagOneLH                 :math:`1 + M_{oo}^{-1}M_{oe} M_{ee}^{-1} M_{eo}`
+SchurDiagOneRH                 :math:`1 + M_{oe} M_{ee}^{-1} M_{eo}M_{oo}^{-1}`
+SchurStaggeredOperator
+=======================       ======================================================================================
+
+	  
+Operator Functions
+===================
+
+Can this be simplified ???
+
+I think a single class could do both OperatorFunction and LinearFunction roles
+Just need to pass the Operator in as a constructor???
+
+Audit this::
+
+
+  template<class Field> class OperatorFunction {
+    virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
+  };
+
+  template<class Field> class LinearFunction {
+    virtual void operator() (const Field &in, Field &out) = 0;
+  };
+
+  /////////////////////////////////////////////////////////////
+  // Base classes for Multishift solvers for operators
+  /////////////////////////////////////////////////////////////
+  template<class Field> class OperatorMultiFunction {
+    virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Hermitian operator Linear function and operator function
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Field>
+  class HermOpOperatorFunction : public OperatorFunction<Field> {
+    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out);
+  };
+
+  template<typename Field>
+  class PlainHermOp : public LinearFunction<Field> {
+    PlainHermOp(LinearOperatorBase<Field>& linop);
+  };
+
+  template<typename Field>
+  class FunctionHermOp : public LinearFunction<Field> {
+    FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop) ;
+  };
+
+
+Algorithms
+=========================================
+
+In this section we describe a number of algorithmic areas present in the core Grid library.
+
+* Approximation: Chebyshev and Rational
+* Krylov solvers: Conjugate Gradient
+* Eigensolver: Chebyshev preconditioned Lanczos
+* FFT: multidimensional FFT of arbitrary lattice fields
+
+Approximation
+--------------
+
+Both Chebyshev and Rational approximation codes are included.
+
+
+Polynomial
+^^^^^^^^^^^^^
+
+
+A polynomial of an operator with a given set of coefficients can be applied::
+
+     template<class Field>                    
+     class Polynomial : public OperatorFunction<Field> { 
+         Polynomial(std::vector<RealD> &_Coeffs) ;       
+     };
+
+Chebyshev
+^^^^^^^^^^^
+
+Class::
+
+    template<class Field> class Chebyshev : public OperatorFunction<Field> 
+	
+Defines  constructors::
+
+    Chebyshev(ChebyParams p);
+    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) );
+    Chebyshev(RealD _lo,RealD _hi,int _order) ;
+
+and methods::
+
+    RealD approx(RealD x);
+
+    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) ;
+
+This will apply the appropriate polynomial of the LinearOperator Linop to the type Field.
+The coefficients for the standard Chebyshev approximation to an arbitrary function, over the range
+[lo,hi] can be set up with the appropriate constructor call.
+
+Remez
+^^^^^^^^^^^
+
+We adopt the Remez class written by Kate Clark with minimal modifications for compatibility
+
+Class::
+
+   class AlgRemez
+
+
+Iterative solvers and algorithms
+-----------------------------------
+
+We document a number of iterative algorithms of topical relevance to Lattice Gauge theory.
+These are written for application to arbitrary fields and arbitrary operators using type
+templating, by implementing them as arbitrary OperatorFunctions.
+
+Most of these algorithms operate on a generic matrix class, which
+derives from LinearOperatorBase.
+
+Linear operators
+^^^^^^^^^^^^^^^^^
+
+By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
+between RB and non-RB variants. Sparse matrix is an abstract fermion action def, and then
+the LinearOperator wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
+replication of code.
+
+algorithms/LinearOperator.h
+
+Class::
+
+    template<class Field> class LinearOperatorBase {
+    public:
+
+      // Support for coarsening to a multigrid
+      virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
+      virtual void OpDir  (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
+
+      virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
+      virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
+      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
+      virtual void HermOp(const Field &in, Field &out)=0;
+    };
+
+The specific operators are::
+
+    template<class Matrix,class Field>  class MdagMLinearOperator : public LinearOperatorBase<Field> 
+    template<class Matrix,class Field>  class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> 
+    template<class Matrix,class Field>  class HermitianLinearOperator : public LinearOperatorBase<Field> 
+    template<class Field>               class SchurOperatorBase :  public LinearOperatorBase<Field> 
+    template<class Matrix,class Field>  class SchurDiagOneRH :  public SchurOperatorBase<Field> 
+    template<class Matrix,class Field>  class SchurDiagOneLH :  public SchurOperatorBase<Field> 
+    template<class Matrix,class Field>  class SchurStaggeredOperator :  public SchurOperatorBase<Field> 
+
+
+Conjugate Gradient 
+^^^^^^^^^^^^^^^^^^^
+
+algorithms/iterative/ConjugateGradient.h
+
+Class::
+
+	   template <class Field>  class ConjugateGradient : public OperatorFunction<Field> 
+
+with methods::
+
+	   ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true);
+
+           void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) ;
+
+Multishift Conjugate Gradient 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+algorithms/iterative/ConjugateGradientMultiShift.h
+
+Class: ``template<class Field> class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>, public OperatorFunction<Field>``
+
+with methods::
+
+       ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true);
+
+       void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) ;
+
+
+Block Conjugate Gradient 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+algorithms/iterative/BlockConjugateGradient.h
+
+Class::
+
+	 template <class Field> class BlockConjugateGradient : public OperatorFunction<Field> 
+
+Several options are possible. The behaviour is controlled by an enumeration.
+
+Enum:: 
+
+         enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
+
+Constructor::
+
+	 BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
+
+With operator::
+
+        void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
+
+
+* CGmultiRHS
+
+
+This applies conjugate gradient to multiple right hand sides concurrently making
+use of a separate Krylov space for each. There is no cross coupling and
+the routine is equivalent to running each of these independently one after the other
+in terms of iteration count.
+
+*  BlockCGrQ
+
+This applies block conjugate gradient to multiple right hand sides concurrently making
+use of a shared Krylov space for each. The cross coupling may in some cases lead to
+acceleration of convergence and reduced matrix multiplies for multiple solves.
+
+Mixed precision Conjugate Gradient 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Class::
+
+    template<class FieldD,class FieldF>  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> 
+
+
+Applies an inner-outer mixed precision Conjugate Gradient. It has constructor::
+
+    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
+
+
+Where the linear operators are for the single precision and double precision operators respectively.
+The operator to apply the inversion is::
+
+    void operator() (const FieldD &src_d_in, FieldD &sol_d){
+
+
+
+Preconditioned Conjugate Residual
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Class::
+
+    template<class Field> class PrecConjugateResidual : public OperatorFunction<Field> 
+
+Constructor::
+
+    PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec)
+
+
+Solve method::
+
+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi)
+
+
+Implicitly restarted Lanczos
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Class::
+
+      template<class Field> class ImplicitlyRestartedLanczos 
+
+Solve method::
+
+      void calc(std::vector<RealD>& eval, std::vector<Field>& evec,  const Field& src, int& Nconv, bool reverse=false)
+
+
+Schur decomposition
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+=======================       ======================================================================================
+Operator                           Description
+=======================       ======================================================================================
+SchurDiagMooeeOperator         :math:`M_{oo} + M_{oe} M_{ee}^{-1} M_{eo}`
+SchurDiagOneLH                 :math:`1 + M_{oo}^{-1}M_{oe} M_{ee}^{-1} M_{eo}`
+SchurDiagOneRH                 :math:`1 + M_{oe} M_{ee}^{-1} M_{eo}M_{oo}^{-1}`
+SchurStaggeredOperator         :math:`m^2 - M_{oe} M_{eo}`
+=======================       ======================================================================================
+
+Associated with these operators, convenience wrappers for the Schur
+decomposed solution of the full system are provided (red-black preconditioning, algorithms/iterative/SchurRedBlack.h):
+
+Class:: 
+
+	  template<class Field> class SchurRedBlackStaggeredSolve 
+	  template<class Field> class SchurRedBlackDiagMooeeSolve
+	  template<class Field> class SchurRedBlackDiagOneLHSolve
+	  template<class Field> class SchurRedBlackDiagOneRHSolve
+
+Constructors::
+
+	  SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver);
+          SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0);
+          SchurRedBlackDiagOneLHSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0);
+          SchurRedBlackDiagOneRHSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0);
+
+The cb parameter specifies which checkerboard the SchurDecomposition factorises around, and the
+HermitianRBSolver parameter is an algorithm class, such as conjugate gradients, for solving the
+system of equations on a single checkerboard.
+
+
+All have the operator method, returning both checkerboard solutions::
+
+        template<class Matrix> void operator() (Matrix & _Matrix,const Field &in, Field &out)
+
+In order to allow for deflation of the preconditioned system, an external guess constructor is possible::
+
+	template<class Matrix, class Guesser> void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
+
+Lattice Gauge theory utilities
+=========================================
+
+
+Spin
+--------
+
+See, for example, tests/core/Test_gamma.cc for a complete self test of the 
+Grid Dirac algebra, spin projectors, and Gamma multiplication table.
+
+The spin basis is:
+
+.. math:: \gamma_x= \left(\begin{array}{cccc}    0& 0& 0& i\\  0& 0& i& 0\\  0&-i& 0& 0\\ -i& 0& 0& 0 \end{array}\right)
+
+.. math:: \gamma_y= \left(\begin{array}{cccc}    0& 0& 0&-1\\  0& 0& 1& 0\\  0& 1& 0& 0\\ -1& 0& 0& 0 \end{array}\right)
+
+.. math:: \gamma_z= \left(\begin{array}{cccc}    0& 0& i& 0\\  0& 0& 0&-i\\ -i& 0& 0& 0\\  0& i& 0& 0 \end{array}\right)
+
+.. math:: \gamma_t= \left(\begin{array}{cccc}    0& 0& 1& 0\\  0& 0& 0& 1\\  1& 0& 0& 0\\  0& 1& 0& 0 \end{array}\right)
+
+.. math:: \gamma_5= \left(\begin{array}{cccc}    1& 0& 0& 0\\  0& 1& 0& 0\\  0& 0&-1 &0\\  0& 0& 0&-1 \end{array}\right)
+
+These can be accessed via a strongly typed enumeration to avoid multiplication by zeros.
+The values are considered opaque, and symbolic names must be used.
+These are signed (prefixes like MinusIdentity also work)::
+
+      Gamma::Algebra::Identity
+      Gamma::Algebra::Gamma5
+      Gamma::Algebra::GammaT
+      Gamma::Algebra::GammaTGamma5
+      Gamma::Algebra::GammaX
+      Gamma::Algebra::GammaXGamma5
+      Gamma::Algebra::GammaY
+      Gamma::Algebra::GammaYGamma5
+      Gamma::Algebra::GammaZ
+      Gamma::Algebra::GammaZGamma5
+      Gamma::Algebra::SigmaXT
+      Gamma::Algebra::SigmaXZ
+      Gamma::Algebra::SigmaXY
+      Gamma::Algebra::SigmaYT
+      Gamma::Algebra::SigmaYZ
+      Gamma::Algebra::SigmaZT
+
+**Example** 
+
+They can be used, for example (benchmarks/Benchmark_wilson.cc)::
+
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+  { // Naive wilson implementation
+    ref = zero;
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = U[mu]*Cshift(src,mu,1);
+      for(int i=0;i<ref._odata.size();i++){
+	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
+      }
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu,-1);
+      for(int i=0;i<ref._odata.size();i++){
+	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
+      }
+    }
+  }
+  ref = -0.5*ref;
+  RealD mass=0.1;
+
+Two spin projection is also possible on non-lattice fields, and used to build high performance routines
+such as the Wilson kernel::
+
+  template<class vtype>  void spProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+  template<class vtype>  void spProjYp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+  template<class vtype>  void spProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+  template<class vtype>  void spProjTp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+  template<class vtype>  void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+  template<class vtype>  void spProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+  template<class vtype>  void spProjYm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+  template<class vtype>  void spProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+  template<class vtype>  void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+  template<class vtype>  void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
+
+and there are associated reconstruction routines for assembling four spinors from these two spinors::
+
+  template<class vtype>  void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+
+  template<class vtype>  void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+  template<class vtype>  void accumRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
+
+These ca
+
+
+SU(N)
+--------
+
+A generic Nc qcd/utils/SUn.h is provided. This defines a template class::
+
+  template <int ncolour> class SU ;
+
+The most important external methods are::
+
+  static void printGenerators(void) ;
+  template <class cplx>  static void generator(int lieIndex, iSUnMatrix<cplx> &ta) ;
+
+  static void SubGroupHeatBath(GridSerialRNG &sRNG, GridParallelRNG &pRNG, RealD beta,  // coeff multiplying staple in action (with no 1/Nc)
+                               LatticeMatrix &link,
+			       const LatticeMatrix &barestaple,  // multiplied by action coeffs so th
+			       int su2_subgroup, int nheatbath, LatticeInteger &wheremask);
+
+  static void GaussianFundamentalLieAlgebraMatrix(GridParallelRNG &pRNG,
+                                                  LatticeMatrix &out,
+                                                  Real scale = 1.0) ;
+  static void GaugeTransform( GaugeField &Umu, GaugeMat &g)
+  static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g);
+
+  static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) ;
+  static void TepidConfiguration(GridParallelRNG &pRNG,GaugeField &out);
+  static void ColdConfiguration(GaugeField &out);
+
+  static void taProj( const LatticeMatrixType &in,  LatticeMatrixType &out);
+  static void taExp(const LatticeMatrixType &x, LatticeMatrixType &ex) ;
+
+  static int su2subgroups(void) ; // returns how many subgroups
+
+
+Specific instantiations are defined::
+
+	 typedef SU<2> SU2;
+	 typedef SU<3> SU3;
+	 typedef SU<4> SU4;
+	 typedef SU<5> SU5;
+
+For example, Quenched QCD updating may be run as (tests/core/Test_quenched_update.cc)::
+
+  for(int sweep=0;sweep<1000;sweep++){
+
+    RealD plaq = ColourWilsonLoops::avgPlaquette(Umu);
+
+    std::cout<<GridLogMessage<<"sweep "<<sweep<<" PLAQUETTE "<<plaq<<std::endl;
+
+    for( int cb=0;cb<2;cb++ ) {
+
+      one.checkerboard=subsets[cb];
+      mask= zero;
+      setCheckerboard(mask,one);
+
+      //      std::cout<<GridLogMessage<<mask<<std::endl;
+      for(int mu=0;mu<Nd;mu++){
+	
+	// Get Link and Staple term in action; must contain Beta and 
+	// any other coeffs
+	ColourWilsonLoops::Staple(staple,Umu,mu);
+
+	link = PeekIndex<LorentzIndex>(Umu,mu);
+
+	for( int subgroup=0;subgroup<SU3::su2subgroups();subgroup++ ) {
+
+	  // update Even checkerboard
+	  SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask);
+
+	}
+
+	PokeIndex<LorentzIndex>(Umu,link,mu);
+	
+	//reunitarise link;
+	ProjectOnGroup(Umu);
+      }
+    }
+  }
+
+
+Space time grids
+----------------
+
+
+
+Lattice actions
+=========================================
+
+We discuss in some detail the implementation of the lattice actions.
+The action is a sum of terms, each of which must inherit from and provide the following interface.
+
+lib/qcd/action/ActionBase.h::
+
+  class Action 
+  {
+
+   public:
+    bool is_smeared = false;
+    // Heatbath?
+    virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
+    virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
+    virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
+    virtual std::string action_name()    = 0;                             // return the action name
+    virtual std::string LogParameters()  = 0;                             // prints action parameters
+    virtual ~Action(){}
+  };
+
+Fermion lattice actions are defined in the qcd/action/fermion subdirectory and gauge actions in the
+qcd/action/gauge subdirectory. For Hybrid Monte Carlo and derivative sampling algorithms,
+pseudofermion actions are defined in the qcd/action/pseudofermion subdirectory.
+
+The simplest lattice action is the Wilson plaquette action, and we start by considering the Wilson loops
+facilities as this is illustrative of the implementation policy design approach.
+
+Wilson loops
+--------------
+
+Wilson loops are common objects in Lattice Gauge Theory.
+A utility class is provided to assist assembling these as they occur both in common observable construction but
+also in actions such as the Wilson plaquette and the rectangle actions. 
+
+Since derivatives with respect to gauge links are required for evolution codes, non-closed staples of 
+various types are also provided. The gauge actions are all assembled consistently from the Wilson loops
+class.
+
+**Implementation policies**
+
+The Wilson loops class is templated to take an implementation policy class parameter. The covariant Cshift is inherited from
+the policy and implements boundary conditions, such as periodic, anti-periodic or charge conjugate. In this
+way the Wilson loop code can automatically transform with the boundary condition and give the right plaquette,
+force terms etc... for the boundary conditions passed in external to the class. 
+
+
+This implementation policy class is called an "impl", and a class that bundles together all the required
+rules to assemble a gauge action is called a Gimpl.
+
+There are several facilities provided by a Gimpl.
+
+These include Boundary conditions and consequently a CovariantCshift.
+
+CovariantCshift 
+^^^^^^^^^^^^^^^^^^
+
+Covariant Cshift operations are provided for common cases of the boundary condition. These may be further optimised
+in future::
+
+  template<class covariant,class gauge> 
+  Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, int mu,
+			   	     const Lattice<covariant> &field);
+
+  template<class covariant,class gauge> 
+  Lattice<covariant> CovShiftBackward(const Lattice<gauge> &Link, int mu,
+			              const Lattice<covariant> &field);
+
+Boundary conditions
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The covariant shift routines occur in namespaces PeriodicBC and ConjugateBC. The correct covariant shift
+for the boundary condition is passed into the gauge actions and wilson loops via an
+"Impl" template policy class.
+
+The relevant staples, plaquettes, and loops are formed by using the provided method::
+
+    Impl::CovShiftForward
+    Impl::CovShiftBackward
+
+etc... This makes physics code transform appropriately with externally supplied rules about
+treating the boundary.
+
+**Example** (lib/qcd/util/WilsonLoops.h)::
+
+  static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
+                           const int mu, const int nu) {
+    // ___
+    //|   |
+    //|<__|
+    plaq = Gimpl::CovShiftForward(U[mu],mu,
+                    Gimpl::CovShiftForward(U[nu],nu,
+			Gimpl::CovShiftBackward(U[mu],mu,
+			   Gimpl::CovShiftIdentityBackward(U[nu], nu))));
+  }
+
+Currently provided predefined implementations are (qcd/action/gauge/GaugeImplementations.h)::
+
+	  typedef PeriodicGaugeImpl<GimplTypesR> PeriodicGimplR; // Real.. whichever prec
+	  typedef PeriodicGaugeImpl<GimplTypesF> PeriodicGimplF; // Float
+	  typedef PeriodicGaugeImpl<GimplTypesD> PeriodicGimplD; // Double
+
+	  typedef PeriodicGaugeImpl<GimplAdjointTypesR> PeriodicGimplAdjR; // Real.. whichever prec
+	  typedef PeriodicGaugeImpl<GimplAdjointTypesF> PeriodicGimplAdjF; // Float
+	  typedef PeriodicGaugeImpl<GimplAdjointTypesD> PeriodicGimplAdjD; // Double
+
+	  typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever prec
+	  typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
+	  typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
+
+Gauge Actions
+---------------
+
+lib/qcd/action/gauge/Photon.h defines the U(1) field::
+
+     class Photon
+
+using Fourier techniques.
+
+lib/qcd/action/gauge/WilsonGaugeAction.h defines the standard plaquette action::
+
+  template <class Gimpl>
+  class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> ;
+
+This action is suitable to use in a Hybrid Monte Carlo evolution as an action term and has constructor::
+
+   WilsonGaugeAction(RealD beta_);
+
+lib/qcd/action/gauge/PlaqPlusRectangleAction.h defines the standard plaquette plus rectangle class of action::
+
+  template<class Gimpl>
+  class PlaqPlusRectangleAction : public Action<typename Gimpl::GaugeField> ;
+
+The constructor is::
+
+   PlaqPlusRectangleAction(RealD b,RealD c);
+
+Due to varying conventions, convenience wrappers are provided::
+
+    template<class Gimpl>    class RBCGaugeAction : public PlaqPlusRectangleAction<Gimpl>;
+    template<class Gimpl>    class IwasakiGaugeAction : public RBCGaugeAction<Gimpl> ;
+    template<class Gimpl>    class SymanzikGaugeAction : public RBCGaugeAction<Gimpl> ;
+    template<class Gimpl>    class DBW2GaugeAction : public RBCGaugeAction<Gimpl> ;
+
+With convenience constructors to set the rectangle coefficient automatically to popular values::
+
+      SymanzikGaugeAction(RealD beta) : RBCGaugeAction<Gimpl>(beta,-1.0/12.0) {};
+      IwasakiGaugeAction(RealD beta) : RBCGaugeAction<Gimpl>(beta,-0.331) {};
+      DBW2GaugeAction(RealD beta) : RBCGaugeAction<Gimpl>(beta,-1.4067) {};
+
+
+Fermion
+--------
+
+These classes all make use of a Fermion Implementation (Fimpl) policy class to provide
+things like boundary conditions, covariant transportation rules etc.
+
+All Fermion operator actions inherit from a common base class
+that conforms to the CheckerBoardedSparseMatrix interface and constrains these objects to conform to the
+interface expected by general algorithms in Grid::
+
+    /////////////////////////////////////////////////////////////////////////////////////////////
+    // Interface defining what I expect of a general sparse matrix, such as a Fermion action
+    /////////////////////////////////////////////////////////////////////////////////////////////
+    template<class Field> class SparseMatrixBase {
+    public:
+      virtual GridBase *Grid(void) =0;
+      // Full checkerboard operations
+      virtual RealD M    (const Field &in, Field &out)=0;
+      virtual RealD Mdag (const Field &in, Field &out)=0;
+      virtual void  MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
+        Field tmp (in._grid);
+        ni=M(in,tmp);
+        no=Mdag(tmp,out);
+      }
+      virtual  void Mdiag    (const Field &in, Field &out)=0;
+      virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
+    };
+
+    /////////////////////////////////////////////////////////////////////////////////////////////
+    // Interface augmented by a red black sparse matrix, such as a Fermion action
+    /////////////////////////////////////////////////////////////////////////////////////////////
+    template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
+    public:
+      virtual GridBase *RedBlackGrid(void)=0;
+      // half checkerboard operations
+      virtual  void Meooe    (const Field &in, Field &out)=0;
+      virtual  void Mooee    (const Field &in, Field &out)=0;
+      virtual  void MooeeInv (const Field &in, Field &out)=0;
+
+      virtual  void MeooeDag    (const Field &in, Field &out)=0;
+      virtual  void MooeeDag    (const Field &in, Field &out)=0;
+      virtual  void MooeeInvDag (const Field &in, Field &out)=0;
+
+    };
+
+The base class for Fermion Operators inherits from these::
+
+    template<class Impl>
+    class FermionOperator : public CheckerBoardedSparseMatrixBase<typename Impl::FermionField>, public Impl
+
+These all make use of an implementation template class, and the possible implementations include::
+
+  typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF;  // Float
+  typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD;  // Double
+
+Staggered fermions make use of a spin index free field via the StaggeredImpl::
+
+  typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float
+  typedef StaggeredImpl<vComplexD, FundamentalRepresentation > StaggeredImplD;  // Double
+
+A number of alternate, non-fundamental Fermion representations are supported. Note that the Fermion
+action code is common to each of these, demonstrating the utility of the template Fimpl classes for separating
+the code that varies from the invariant sections::
+
+  typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF;  // Float
+  typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD;  // Double
+
+  typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF;  // Float
+  typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD;  // Double
+ 
+  typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float
+  typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double
+
+G-parity boundary conditions are supported, and an additional flavour index inserted on the Fermion field via the Gparity implementation::
+
+  typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
+  typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
+  
+ZMobius Fermions use complex rather than real action coefficients and are supported via an alternate implementation::
+
+  typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
+  typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
+
+
+Some example constructor calls are given below for Wilson and Clover fermions::
+
+    template <class Impl> class WilsonFermion;
+
+With constructor::
+
+    WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                  GridRedBlackCartesian &Hgrid, RealD _mass, 
+		  const ImplParams &p = ImplParams(), 
+                  const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
+
+and::
+
+    template <class Impl> class WilsonCloverFermion : public WilsonFermion<Impl>;
+
+with constructor::
+
+    WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                        GridRedBlackCartesian &Hgrid,
+                        const RealD _mass,
+                        const RealD _csw_r = 0.0,
+                        const RealD _csw_t = 0.0,
+                        const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
+                        const ImplParams &impl_p = ImplParams());
+
+Additional parameters allow for anisotropic versions to be created, which take default values for
+the isotropic case.
+
+The constructor signatures can be found in the header files in qcd/action/fermion/.
+A complete list of the 4D ultralocal Fermion types is::
+
+    WilsonFermion<WilsonImplF>                            WilsonFermionF;
+    WilsonFermion<WilsonAdjImplF>                         WilsonAdjFermionF;
+    WilsonFermion<WilsonTwoIndexSymmetricImplF>           WilsonTwoIndexSymmetricFermionF;
+    WilsonFermion<WilsonTwoIndexAntiSymmetricImplF>       WilsonTwoIndexAntiSymmetricFermionF;
+    WilsonTMFermion<WilsonImplF>                          WilsonTMFermionF;
+    WilsonCloverFermion<WilsonImplF>                      WilsonCloverFermionF;
+    WilsonCloverFermion<WilsonAdjImplF>                   WilsonCloverAdjFermionF;
+    WilsonCloverFermion<WilsonTwoIndexSymmetricImplF>     WilsonCloverTwoIndexSymmetricFermionF;
+    WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
+    ImprovedStaggeredFermion<StaggeredImplF>              ImprovedStaggeredFermionF;
+
+Cayley form chiral fermions (incl. domain wall)::
+
+    DomainWallFermion<WilsonImplF>                   DomainWallFermionF;
+    DomainWallEOFAFermion<WilsonImplF>               DomainWallEOFAFermionF;
+    MobiusFermion<WilsonImplF>                       MobiusFermionF;
+    MobiusEOFAFermion<WilsonImplF>                   MobiusEOFAFermionF;
+    ZMobiusFermion<ZWilsonImplF>                     ZMobiusFermionF;
+    ScaledShamirFermion<WilsonImplF>                 ScaledShamirFermionF;
+    MobiusZolotarevFermion<WilsonImplF>              MobiusZolotarevFermionF;
+    ShamirZolotarevFermion<WilsonImplF>              ShamirZolotarevFermionF;
+    OverlapWilsonCayleyTanhFermion<WilsonImplF>      OverlapWilsonCayleyTanhFermionF;
+    OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
+
+Continued fraction overlap::
+
+    OverlapWilsonContFracTanhFermion<WilsonImplF>      OverlapWilsonContFracTanhFermionF;
+    OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
+
+Partial fraction overlap::
+
+    OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
+    OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
+
+Gparity cases; a partial list is defined until tested::
+
+    WilsonFermion<GparityWilsonImplF>         GparityWilsonFermionF;
+    DomainWallFermion<GparityWilsonImplF>     GparityDomainWallFermionF;
+    DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
+
+    WilsonTMFermion<GparityWilsonImplF>       GparityWilsonTMFermionF;
+    MobiusFermion<GparityWilsonImplF>         GparityMobiusFermionF;
+    MobiusEOFAFermion<GparityWilsonImplF>     GparityMobiusEOFAFermionF;
+
+For each action, the suffix "F" can be replaced with "D" to obtain a double precision version. More generally,
+it is possible to perform communications with a different precision from computation.
+The number of combinations is rather large to list, but in the above listing the substitution is the
+obvious one.
+
+==========   ================  ==================
+Suffix        Computation       Communication
+==========   ================  ==================
+F                fp32               fp32
+D                fp64               fp64
+R                default            default
+FH               fp32               fp16
+DF               fp64               fp32
+RL               default            lower 
+==========   ================  ==================
+
+
+Pseudofermion
+---------------
+
+Pseudofermion actions are defined in  qcd/action/pseudofermion/ .
+These action terms are built from template classes::
+
+    // Base even odd HMC on the normal Mee based schur decomposition.
+    //
+    //     M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
+    //         (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
+    //
+    // Determinant is det of middle factor. This assumes Mee is indept of U.
+    template<class Impl> class SchurDifferentiableOperator ;
+
+    // S = phi^dag (Mdag M)^-1 phi
+    template <class Impl>  class TwoFlavourPseudoFermionAction ;
+
+    // S = phi^dag V (Mdag M)^-1 Vdag phi
+    template<class Impl>   class TwoFlavourRatioPseudoFermionAction ;
+
+    // S = phi^dag (Mdag M)^-1 phi  (odd)
+    //   + phi^dag (Mdag M)^-1 phi  (even)
+    template <class Impl>  class TwoFlavourEvenOddPseudoFermionAction;
+
+    // S = phi^dag V (Mdag M)^-1 Vdag phi
+    template <class Impl>  class TwoFlavourEvenOddRatioPseudoFermionAction ;
+
+
+Rational HMC pseudofermion terms::
+
+    // S_f = chi^dag *  N(M^dag*M)/D(M^dag*M) * chi
+    //
+    // Here, M is some operator 
+    // N and D makeup the rat. poly 
+    template<class Impl> class OneFlavourRationalPseudoFermionAction;
+
+    // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    //
+    // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}  
+    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
+    template<class Impl> class OneFlavourRatioRationalPseudoFermionAction;
+
+
+    // S = phi^dag (Mdag M)^-1/2 phi
+    template <class Impl> class OneFlavourEvenOddRationalPseudoFermionAction;
+
+    // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
+    //
+    // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}  
+    // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}  
+    template<class Impl> class OneFlavourEvenOddRatioRationalPseudoFermionAction;
+
+The relevant Fermion operators are constructed externally,
+and references are passed in to these object constructors. Thus, they work for any Fermion operator and the code
+can be shared. For example, one of the constructors is given as::
+
+      TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+                                                FermionOperator<Impl>  &_DenOp, 
+                                                OperatorFunction<FermionField> & DS,
+                                                OperatorFunction<FermionField> & AS) :
+
+
+The exact one flavour algorithm for Domain Wall Fermions is present but is not documented here::
+
+      #include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
+
+HMC
+=========================================
+
+There are a large number of examples under tests/hmc/
+
+The most important data structure associated with (R)HMC describes the action
+and integration scheme.
+
+The action is a sum of terms, possibly with nested timesteps.
+
+This is assembled in an ActionSet object (qcd/action/ActionSet.h).
+The timesteps are managed by Levels. The ActionSet object maintains a list
+of ActionLevel objects::
+
+  // Define the ActionSet
+  template <class GaugeField, class R>
+  using ActionSet = std::vector<ActionLevel<GaugeField, R> >;
+
+Each ActionLevel is associated with each level of the
+nested integration scheme, schematically::
+
+  template <class Field>
+  struct ActionLevel {
+  public:
+    unsigned int multiplier;
+    // Fundamental repr actions separated because of the smearing
+    typedef Action<Field>* ActPtr;
+    std::vector<ActPtr>& actions;
+  }
+
+The outer loop, running MD trajectories, Metropolis steps, saving and restoring configurations
+is generic and managed by a "Runner" class.
+
+There are a number of possible integrators: LeapFrog, MinimumNorm2, and ForceGradient.
+
+The integrator is passed as a template parameter to the GenericHMCRunner class. We will take as an example Test_hmc_EODWFRatio.cc::
+
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;  // Uses the default minimum norm
+
+The test defines the Fermion action and Gauge action::
+
+  typedef WilsonImplR FermionImplPolicy;
+  typedef DomainWallFermionR FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+  HMCWrapper TheHMC;
+
+The HMC runner is given the Grid information (lifted from standard Grid --grid Lx.Ly.Lz.Lt command line)::
+
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+The checkpointing strategy is defined::
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams;  
+  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.rng_prefix = "ckpoint_EODWF_rng";
+  CPparams.saveInterval = 5;
+  CPparams.format = "IEEE64BIG";
+  
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+.. todo::  HOW TO resume from saved RNGs. Guido changed this.
+
+Random number generators are seeded::
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+Observables measured at the end of the trajectory can be registered::
+
+  // Construct observables
+  // here there is too much indirection 
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+
+The action must be defined::
+
+  RealD beta = 5.6 ;
+  WilsonGaugeActionR Waction(beta);
+
+  const int Ls = 8;
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  LatticeGaugeField U(GridPtr);
+
+  Real mass = 0.04;
+  Real pv   = 1.0;
+  RealD M5  = 1.5;
+
+  FermionAction DenOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,mass,M5);
+  FermionAction NumOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv,  M5);
+
+  double StoppingCondition = 1.0e-8;
+  double MaxCGIterations = 10000;
+  ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+  TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> Nf2(NumOp, DenOp,CG,CG);
+
+    // Set smearing (true/false), default: false
+  Nf2.is_smeared = false;
+
+  // Collect actions
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  Level1.push_back(&Nf2);
+
+  ActionLevel<HMCWrapper::Field> Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+
+And the HMC can be set up and run::
+
+  TheHMC.Parameters.MD.MDsteps = 20;
+  TheHMC.Parameters.MD.trajL   = 1.0;
+  TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+
+  TheHMC.Run();  // no smearing
+
+This puts together the pieces of the previous sections (actions, Fermion operators, solver algorithms, etc.) into a full application.
+
+Development of the internals
+========================================
+
+.. todo:: CD: The whole section needs to be completed, of course
+	  
+The interfaces used in this chapter of the manual are subject 
+to change without notice as new architectures are addressed.
+
+The intent is to document Grid's approach to writing optimised
+code, and to provide guidelines to those developers working on the internals.
+
+Simd classes
+---------------------------------------------
+
+Communications facilities
+---------------------------------------------
+
+Cartesian Grid facilities and field layout
+---------------------------------------------
+
+Stencil construction
+---------------------------------------------
+
+Optimised fermion operators
+---------------------------------------------
+
+Optimised communications
+---------------------------------------------
+
+
+.. include:: interfacing.rst
+	  
+
+.. image:: logo.png
+   :width: 200px
+   :align: center
+
diff --git a/extras/Hadrons/Environment.hpp b/extras/Hadrons/Environment.hpp
deleted file mode 100644
index 7f1bc26d..00000000
--- a/extras/Hadrons/Environment.hpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: extras/Hadrons/Environment.hpp
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef Hadrons_Environment_hpp_
-#define Hadrons_Environment_hpp_
-
-#include <Grid/Hadrons/Global.hpp>
-
-BEGIN_HADRONS_NAMESPACE
-
-/******************************************************************************
- *                         Global environment                                 *
- ******************************************************************************/
-class Object
-{
-public:
-    Object(void) = default;
-    virtual ~Object(void) = default;
-};
-
-template <typename T>
-class Holder: public Object
-{
-public:
-    Holder(void) = default;
-    Holder(T *pt);
-    virtual ~Holder(void) = default;
-    T &       get(void) const;
-    T *       getPt(void) const;
-    void      reset(T *pt);
-private:
-    std::unique_ptr<T> objPt_{nullptr};
-};
-
-#define DEFINE_ENV_ALIAS \
-inline Environment & env(void) const\
-{\
-    return Environment::getInstance();\
-}
-
-class Environment
-{
-    SINGLETON(Environment);
-public:
-    typedef SITE_SIZE_TYPE                         Size;
-    typedef std::unique_ptr<GridCartesian>         GridPt;
-    typedef std::unique_ptr<GridRedBlackCartesian> GridRbPt;
-    typedef std::unique_ptr<GridParallelRNG>       RngPt;
-    enum class Storage {object, cache, temporary};
-private:
-    struct ObjInfo
-    {
-        Size                    size{0};
-        Storage                 storage{Storage::object};
-        unsigned int            Ls{0};
-        const std::type_info    *type{nullptr};
-        std::string             name;
-        int                     module{-1};
-        std::unique_ptr<Object> data{nullptr};
-    };
-public:
-    // grids
-    void                    createGrid(const unsigned int Ls);
-    GridCartesian *         getGrid(const unsigned int Ls = 1) const;
-    GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
-    std::vector<int>        getDim(void) const;
-    int                     getDim(const unsigned int mu) const;
-    unsigned long int       getLocalVolume(void) const;
-    unsigned int            getNd(void) const;
-    // random number generator
-    void                    setSeed(const std::vector<int> &seed);
-    GridParallelRNG *       get4dRng(void) const;
-    // general memory management
-    void                    addObject(const std::string name,
-                                      const int moduleAddress = -1);
-    template <typename B, typename T, typename ... Ts>
-    void                    createDerivedObject(const std::string name,
-                                                const Environment::Storage storage,
-                                                const unsigned int Ls,
-                                                Ts && ... args);
-    template <typename T, typename ... Ts>
-    void                    createObject(const std::string name,
-                                         const Environment::Storage storage,
-                                         const unsigned int Ls,
-                                         Ts && ... args);
-    void                    setObjectModule(const unsigned int objAddress,
-                                            const int modAddress);
-    template <typename T>
-    T *                     getObject(const unsigned int address) const;
-    template <typename T>
-    T *                     getObject(const std::string name) const;
-    unsigned int            getMaxAddress(void) const;
-    unsigned int            getObjectAddress(const std::string name) const;
-    std::string             getObjectName(const unsigned int address) const;
-    std::string             getObjectType(const unsigned int address) const;
-    std::string             getObjectType(const std::string name) const;
-    Size                    getObjectSize(const unsigned int address) const;
-    Size                    getObjectSize(const std::string name) const;
-    Storage                 getObjectStorage(const unsigned int address) const;
-    Storage                 getObjectStorage(const std::string name) const;
-    int                     getObjectModule(const unsigned int address) const;
-    int                     getObjectModule(const std::string name) const;
-    unsigned int            getObjectLs(const unsigned int address) const;
-    unsigned int            getObjectLs(const std::string name) const;
-    bool                    hasObject(const unsigned int address) const;
-    bool                    hasObject(const std::string name) const;
-    bool                    hasCreatedObject(const unsigned int address) const;
-    bool                    hasCreatedObject(const std::string name) const;
-    bool                    isObject5d(const unsigned int address) const;
-    bool                    isObject5d(const std::string name) const;
-    template <typename T>
-    bool                    isObjectOfType(const unsigned int address) const;
-    template <typename T>
-    bool                    isObjectOfType(const std::string name) const;
-    Environment::Size       getTotalSize(void) const;
-    void                    freeObject(const unsigned int address);
-    void                    freeObject(const std::string name);
-    void                    freeAll(void);
-    void                    protectObjects(const bool protect);
-    bool                    objectsProtected(void) const;
-    // print environment content
-    void                    printContent(void) const;
-private:
-    // general
-    unsigned long int                      locVol_;
-    bool                                   protect_{true};
-    // grids
-    std::vector<int>                       dim_;
-    GridPt                                 grid4d_;
-    std::map<unsigned int, GridPt>         grid5d_;
-    GridRbPt                               gridRb4d_;
-    std::map<unsigned int, GridRbPt>       gridRb5d_;
-    unsigned int                           nd_;
-    // random number generator
-    RngPt                                  rng4d_;
-    // object store
-    std::vector<ObjInfo>                   object_;
-    std::map<std::string, unsigned int>    objectAddress_;
-};
-
-/******************************************************************************
- *                       Holder template implementation                       *
- ******************************************************************************/
-// constructor /////////////////////////////////////////////////////////////////
-template <typename T>
-Holder<T>::Holder(T *pt)
-: objPt_(pt)
-{}
-
-// access //////////////////////////////////////////////////////////////////////
-template <typename T>
-T & Holder<T>::get(void) const
-{
-    return &objPt_.get();
-}
-
-template <typename T>
-T * Holder<T>::getPt(void) const
-{
-    return objPt_.get();
-}
-
-template <typename T>
-void Holder<T>::reset(T *pt)
-{
-    objPt_.reset(pt);
-}
-
-/******************************************************************************
- *                     Environment template implementation                    *
- ******************************************************************************/
-// general memory management ///////////////////////////////////////////////////
-template <typename B, typename T, typename ... Ts>
-void Environment::createDerivedObject(const std::string name,
-                                      const Environment::Storage storage,
-                                      const unsigned int Ls,
-                                      Ts && ... args)
-{
-    if (!hasObject(name))
-    {
-        addObject(name);
-    }
-    
-    unsigned int address = getObjectAddress(name);
-    
-    if (!object_[address].data or !objectsProtected())
-    {
-        MemoryStats memStats;
-    
-        if (!MemoryProfiler::stats)
-        {
-            MemoryProfiler::stats = &memStats;
-        }
-        size_t initMem           = MemoryProfiler::stats->currentlyAllocated;
-        object_[address].storage = storage;
-        object_[address].Ls      = Ls;
-        object_[address].data.reset(new Holder<B>(new T(std::forward<Ts>(args)...)));
-        object_[address].size    = MemoryProfiler::stats->maxAllocated - initMem;
-        object_[address].type    = &typeid(T);
-        if (MemoryProfiler::stats == &memStats)
-        {
-            MemoryProfiler::stats = nullptr;
-        }
-    }
-    // object already exists, no error if it is a cache, error otherwise
-    else if ((object_[address].storage != Storage::cache) or 
-             (object_[address].storage != storage)        or
-             (object_[address].name    != name)           or
-             (object_[address].type    != &typeid(T)))
-    {
-        HADRON_ERROR(Definition, "object '" + name + "' already allocated");
-    }
-}
-
-template <typename T, typename ... Ts>
-void Environment::createObject(const std::string name, 
-                               const Environment::Storage storage,
-                               const unsigned int Ls,
-                               Ts && ... args)
-{
-    createDerivedObject<T, T>(name, storage, Ls, std::forward<Ts>(args)...);
-}
-
-template <typename T>
-T * Environment::getObject(const unsigned int address) const
-{
-    if (hasObject(address))
-    {
-        if (hasCreatedObject(address))
-        {
-            if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
-            {
-                return h->getPt();
-            }
-            else
-            {
-                HADRON_ERROR(Definition, "object with address " + std::to_string(address) +
-                            " does not have type '" + typeName(&typeid(T)) +
-                            "' (has type '" + getObjectType(address) + "')");
-            }
-        }
-        else
-        {
-            HADRON_ERROR(Definition, "object with address " + std::to_string(address) +
-                         " is empty");
-        }
-    }
-    else
-    {
-        HADRON_ERROR(Definition, "no object with address " + std::to_string(address));
-    }
-}
-
-template <typename T>
-T * Environment::getObject(const std::string name) const
-{
-    return getObject<T>(getObjectAddress(name));
-}
-
-template <typename T>
-bool Environment::isObjectOfType(const unsigned int address) const
-{
-    if (hasObject(address))
-    {
-        if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
-        {
-            return true;
-        }
-        else
-        {
-            return false;
-        }
-    }
-    else
-    {
-        HADRON_ERROR(Definition, "no object with address " + std::to_string(address));
-    }
-}
-
-template <typename T>
-bool Environment::isObjectOfType(const std::string name) const
-{
-    return isObjectOfType<T>(getObjectAddress(name));
-}
-
-END_HADRONS_NAMESPACE
-
-#endif // Hadrons_Environment_hpp_
diff --git a/extras/Hadrons/Global.hpp b/extras/Hadrons/Global.hpp
deleted file mode 100644
index ebca2aad..00000000
--- a/extras/Hadrons/Global.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: extras/Hadrons/Global.hpp
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef Hadrons_Global_hpp_
-#define Hadrons_Global_hpp_
-
-#include <set>
-#include <stack>
-#include <Grid/Grid.h>
-#include <cxxabi.h>
-
-#ifndef SITE_SIZE_TYPE
-#define SITE_SIZE_TYPE size_t
-#endif
-
-#define BEGIN_HADRONS_NAMESPACE \
-namespace Grid {\
- \
-namespace Hadrons {\
-using Grid::operator<<;
-#define END_HADRONS_NAMESPACE }}
-
-#define BEGIN_MODULE_NAMESPACE(name)\
-namespace name {\
-using Grid::operator<<;
-#define END_MODULE_NAMESPACE }
-
-/* the 'using Grid::operator<<;' statement prevents a very nasty compilation
- * error with GCC 5 (clang & GCC 6 compile fine without it).
- */
-
-#ifndef FIMPL
-#define FIMPL WilsonImplR
-#endif
-#ifndef SIMPL
-#define SIMPL ScalarImplCR
-#endif
-
-BEGIN_HADRONS_NAMESPACE
-
-// type aliases
-#define FERM_TYPE_ALIASES(FImpl, suffix)\
-typedef FermionOperator<FImpl>                        FMat##suffix;            \
-typedef typename FImpl::FermionField                  FermionField##suffix;    \
-typedef typename FImpl::PropagatorField               PropagatorField##suffix; \
-typedef typename FImpl::SitePropagator::scalar_object SitePropagator##suffix;  \
-typedef std::vector<SitePropagator##suffix>           SlicedPropagator##suffix;
-
-#define GAUGE_TYPE_ALIASES(FImpl, suffix)\
-typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
-
-#define SCALAR_TYPE_ALIASES(SImpl, suffix)\
-typedef typename SImpl::Field ScalarField##suffix;\
-typedef typename SImpl::Field PropagatorField##suffix;
-
-#define SOLVER_TYPE_ALIASES(FImpl, suffix)\
-typedef std::function<void(FermionField##suffix &,\
-                      const FermionField##suffix &)> SolverFn##suffix;
-
-#define SINK_TYPE_ALIASES(suffix)\
-typedef std::function<SlicedPropagator##suffix(const PropagatorField##suffix &)> SinkFn##suffix;
-
-#define FGS_TYPE_ALIASES(FImpl, suffix)\
-FERM_TYPE_ALIASES(FImpl, suffix)\
-GAUGE_TYPE_ALIASES(FImpl, suffix)\
-SOLVER_TYPE_ALIASES(FImpl, suffix)
-
-// logger
-class HadronsLogger: public Logger
-{
-public:
-    HadronsLogger(int on, std::string nm): Logger("Hadrons", on, nm,
-                                                  GridLogColours, "BLACK"){};
-};
-
-#define LOG(channel) std::cout << HadronsLog##channel
-#define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
-
-extern HadronsLogger HadronsLogError;
-extern HadronsLogger HadronsLogWarning;
-extern HadronsLogger HadronsLogMessage;
-extern HadronsLogger HadronsLogIterative;
-extern HadronsLogger HadronsLogDebug;
-
-// singleton pattern
-#define SINGLETON(name)\
-public:\
-    name(const name &e) = delete;\
-    void operator=(const name &e) = delete;\
-    static name & getInstance(void)\
-    {\
-        static name e;\
-        return e;\
-    }\
-private:\
-    name(void);
-
-#define SINGLETON_DEFCTOR(name)\
-public:\
-    name(const name &e) = delete;\
-    void operator=(const name &e) = delete;\
-    static name & getInstance(void)\
-    {\
-        static name e;\
-        return e;\
-    }\
-private:\
-    name(void) = default;
-
-// type utilities
-template <typename T>
-const std::type_info * typeIdPt(const T &x)
-{
-    return &typeid(x);
-}
-
-std::string typeName(const std::type_info *info);
-
-template <typename T>
-const std::type_info * typeIdPt(void)
-{
-    return &typeid(T);
-}
-
-template <typename T>
-std::string typeName(const T &x)
-{
-    return typeName(typeIdPt(x));
-}
-
-template <typename T>
-std::string typeName(void)
-{
-    return typeName(typeIdPt<T>());
-}
-
-// default writers/readers
-#ifdef HAVE_HDF5
-typedef Hdf5Reader CorrReader;
-typedef Hdf5Writer CorrWriter;
-#else
-typedef XmlReader CorrReader;
-typedef XmlWriter CorrWriter;
-#endif
-
-END_HADRONS_NAMESPACE
-
-#include <Grid/Hadrons/Exceptions.hpp>
-
-#endif // Hadrons_Global_hpp_
diff --git a/extras/Hadrons/HadronsXmlSchedule.cc b/extras/Hadrons/HadronsXmlSchedule.cc
deleted file mode 100644
index 2ac84817..00000000
--- a/extras/Hadrons/HadronsXmlSchedule.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: extras/Hadrons/HadronsXmlSchedule.cc
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Hadrons/Application.hpp>
-
-using namespace Grid;
- 
-using namespace Hadrons;
-
-int main(int argc, char *argv[])
-{
-    // parse command line
-    std::string parameterFileName, scheduleFileName;
-    
-    if (argc < 3)
-    {
-        std::cerr << "usage: " << argv[0] << " <parameter file> <schedule output> [Grid options]";
-        std::cerr << std::endl;
-        std::exit(EXIT_FAILURE);
-    }
-    parameterFileName = argv[1];
-    scheduleFileName  = argv[2];
-    
-    // initialization
-    Grid_init(&argc, &argv);
-    HadronsLogError.Active(GridLogError.isActive());
-    HadronsLogWarning.Active(GridLogWarning.isActive());
-    HadronsLogMessage.Active(GridLogMessage.isActive());
-    HadronsLogIterative.Active(GridLogIterative.isActive());
-    HadronsLogDebug.Active(GridLogDebug.isActive());
-    LOG(Message) << "Grid initialized" << std::endl;
-    
-    // execution
-    Application application;
-    
-    application.parseParameterFile(parameterFileName);
-    application.schedule();
-    application.printSchedule();
-    application.saveSchedule(scheduleFileName);
-    
-    // epilogue
-    LOG(Message) << "Grid is finalizing now" << std::endl;
-    Grid_finalize();
-    
-    return EXIT_SUCCESS;
-}
diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp
deleted file mode 100644
index cf381d0f..00000000
--- a/extras/Hadrons/Modules.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: extras/Hadrons/Modules.hpp
-
-Copyright (C) 2015
-Copyright (C) 2016
-Copyright (C) 2017
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
-#include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
-#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
-#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WardIdentity.hpp>
-#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
-#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
-#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
-#include <Grid/Hadrons/Modules/MSource/Point.hpp>
-#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
-#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
-#include <Grid/Hadrons/Modules/MSource/SeqConserved.hpp>
-#include <Grid/Hadrons/Modules/MSink/Smear.hpp>
-#include <Grid/Hadrons/Modules/MSink/Point.hpp>
-#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
-#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
-#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
-#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
-#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
-#include <Grid/Hadrons/Modules/MUtilities/TestSeqGamma.hpp>
-#include <Grid/Hadrons/Modules/MUtilities/TestSeqConserved.hpp>
-#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
-#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
-#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
-#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
-#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
-#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.cc b/extras/Hadrons/Modules/MScalar/ChargedProp.cc
deleted file mode 100644
index 4668544c..00000000
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.cc
+++ /dev/null
@@ -1,221 +0,0 @@
-#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
-#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
-
-using namespace Grid;
-using namespace Hadrons;
-using namespace MScalar;
-
-/******************************************************************************
-*                     TChargedProp implementation                             *
-******************************************************************************/
-// constructor /////////////////////////////////////////////////////////////////
-TChargedProp::TChargedProp(const std::string name)
-: Module<ChargedPropPar>(name)
-{}
-
-// dependencies/products ///////////////////////////////////////////////////////
-std::vector<std::string> TChargedProp::getInput(void)
-{
-    std::vector<std::string> in = {par().source, par().emField};
-    
-    return in;
-}
-
-std::vector<std::string> TChargedProp::getOutput(void)
-{
-    std::vector<std::string> out = {getName()};
-    
-    return out;
-}
-
-// setup ///////////////////////////////////////////////////////////////////////
-void TChargedProp::setup(void)
-{
-    freeMomPropName_ = FREEMOMPROP(par().mass);
-    phaseName_.clear();
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    {
-        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
-    }
-    GFSrcName_ = getName() + "_DinvSrc";
-    fftName_   = getName() + "_fft";
-
-    freeMomPropDone_ = env().hasCreatedObject(freeMomPropName_);
-    GFSrcDone_       = env().hasCreatedObject(GFSrcName_);
-    phasesDone_      = env().hasCreatedObject(phaseName_[0]);
-    envCacheLat(ScalarField, freeMomPropName_);
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    {
-        envCacheLat(ScalarField, phaseName_[mu]);
-    }
-    envCacheLat(ScalarField, GFSrcName_);
-    envCreateLat(ScalarField, getName());
-    envTmpLat(ScalarField, "buf");
-    envTmpLat(ScalarField, "result");
-    envTmpLat(ScalarField, "Amu");
-    envCache(FFT, fftName_, 1, env().getGrid());
-}
-
-// execution ///////////////////////////////////////////////////////////////////
-void TChargedProp::execute(void)
-{
-    // CACHING ANALYTIC EXPRESSIONS
-    makeCaches();
-
-    // PROPAGATOR CALCULATION
-    LOG(Message) << "Computing charged scalar propagator"
-                 << " (mass= " << par().mass
-                 << ", charge= " << par().charge << ")..." << std::endl;
-    
-    auto   &prop  = envGet(ScalarField, getName());
-    auto   &GFSrc = envGet(ScalarField, GFSrcName_);
-    auto   &G     = envGet(ScalarField, freeMomPropName_);
-    auto   &fft   = envGet(FFT, fftName_);
-    double q      = par().charge;
-    envGetTmp(ScalarField, result); 
-    envGetTmp(ScalarField, buf); 
-
-    // G*F*Src
-    prop = GFSrc;
-
-    // - q*G*momD1*G*F*Src (momD1 = F*D1*Finv)
-    buf = GFSrc;
-    momD1(buf, fft);
-    buf = G*buf;
-    prop = prop - q*buf;
-
-    // + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
-    momD1(buf, fft);
-    prop = prop + q*q*G*buf;
-
-    // - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv)
-    buf = GFSrc;
-    momD2(buf, fft);
-    prop = prop - q*q*G*buf;
-
-    // final FT
-    fft.FFT_all_dim(prop, prop, FFT::backward);
-    
-    // OUTPUT IF NECESSARY
-    if (!par().output.empty())
-    {
-        std::string           filename = par().output + "." +
-                                         std::to_string(vm().getTrajectory());
-        
-        LOG(Message) << "Saving zero-momentum projection to '"
-                     << filename << "'..." << std::endl;
-        
-        CorrWriter            writer(filename);
-        std::vector<TComplex> vecBuf;
-        std::vector<Complex>  result;
-        
-        sliceSum(prop, vecBuf, Tp);
-        result.resize(vecBuf.size());
-        for (unsigned int t = 0; t < vecBuf.size(); ++t)
-        {
-            result[t] = TensorRemove(vecBuf[t]);
-        }
-        write(writer, "charge", q);
-        write(writer, "prop", result);
-    }
-}
-
-void TChargedProp::makeCaches(void)
-{
-    auto &freeMomProp = envGet(ScalarField, freeMomPropName_);
-    auto &GFSrc       = envGet(ScalarField, GFSrcName_);
-    auto &fft         = envGet(FFT, fftName_);
-
-    if (!freeMomPropDone_)
-    {
-        LOG(Message) << "Caching momentum space free scalar propagator"
-                     << " (mass= " << par().mass << ")..." << std::endl;
-        SIMPL::MomentumSpacePropagator(freeMomProp, par().mass);
-    }
-    if (!GFSrcDone_)
-    {   
-        FFT  fft(env().getGrid());
-        auto &source = envGet(ScalarField, par().source);
-
-        LOG(Message) << "Caching G*F*src..." << std::endl;
-        fft.FFT_all_dim(GFSrc, source, FFT::forward);
-        GFSrc = freeMomProp*GFSrc;
-    }
-    if (!phasesDone_)
-    {
-        Coordinate &l = env().getGrid()->_fdimensions;
-        Complex          ci(0.0,1.0);
-        
-        LOG(Message) << "Caching shift phases..." << std::endl;
-        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-        {
-            Real twoPiL = M_PI*2./l[mu];
-            auto &phmu  = envGet(ScalarField, phaseName_[mu]);
-            
-            LatticeCoordinate(phmu, mu);
-            phmu = exp(ci*twoPiL*phmu);
-            phase_.push_back(&phmu);
-        }
-    }
-}
-
-void TChargedProp::momD1(ScalarField &s, FFT &fft)
-{
-    auto        &A = envGet(EmField, par().emField);
-    Complex     ci(0.0,1.0);
-
-    envGetTmp(ScalarField, buf);
-    envGetTmp(ScalarField, result);
-    envGetTmp(ScalarField, Amu);
-
-    result = Zero();
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    {
-        Amu = peekLorentz(A, mu);
-        buf = (*phase_[mu])*s;
-        fft.FFT_all_dim(buf, buf, FFT::backward);
-        buf = Amu*buf;
-        fft.FFT_all_dim(buf, buf, FFT::forward);
-        result = result - ci*buf;
-    }
-    fft.FFT_all_dim(s, s, FFT::backward);
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    {
-        Amu = peekLorentz(A, mu);
-        buf = Amu*s;
-        fft.FFT_all_dim(buf, buf, FFT::forward);
-        result = result + ci*adj(*phase_[mu])*buf;
-    }
-
-    s = result;
-}
-
-void TChargedProp::momD2(ScalarField &s, FFT &fft)
-{
-    auto &A = envGet(EmField, par().emField);
-
-    envGetTmp(ScalarField, buf);
-    envGetTmp(ScalarField, result);
-    envGetTmp(ScalarField, Amu);
-
-    result = Zero();
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    {
-        Amu = peekLorentz(A, mu);
-        buf = (*phase_[mu])*s;
-        fft.FFT_all_dim(buf, buf, FFT::backward);
-        buf = Amu*Amu*buf;
-        fft.FFT_all_dim(buf, buf, FFT::forward);
-        result = result + .5*buf;
-    }
-    fft.FFT_all_dim(s, s, FFT::backward);
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    {
-        Amu = peekLorentz(A, mu);        
-        buf = Amu*Amu*s;
-        fft.FFT_all_dim(buf, buf, FFT::forward);
-        result = result + .5*adj(*phase_[mu])*buf;
-    }
-
-    s = result;
-}
diff --git a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp b/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
deleted file mode 100644
index cfcce28e..00000000
--- a/extras/Hadrons/Modules/MScalar/ChargedProp.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef Hadrons_MScalar_ChargedProp_hpp_
-#define Hadrons_MScalar_ChargedProp_hpp_
-
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
-
-BEGIN_HADRONS_NAMESPACE
-
-/******************************************************************************
- *                       Charged scalar propagator                            *
- ******************************************************************************/
-BEGIN_MODULE_NAMESPACE(MScalar)
-
-class ChargedPropPar: Serializable
-{
-public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar,
-                                    std::string, emField,
-                                    std::string, source,
-                                    double,      mass,
-                                    double,      charge,
-                                    std::string, output);
-};
-
-class TChargedProp: public Module<ChargedPropPar>
-{
-public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
-    typedef PhotonR::GaugeField     EmField;
-    typedef PhotonR::GaugeLinkField EmComp;
-public:
-    // constructor
-    TChargedProp(const std::string name);
-    // destructor
-    virtual ~TChargedProp(void) = default;
-    // dependency relation
-    virtual std::vector<std::string> getInput(void);
-    virtual std::vector<std::string> getOutput(void);
-protected:
-    // setup
-    virtual void setup(void);
-    // execution
-    virtual void execute(void);
-private:
-    void makeCaches(void);
-    void momD1(ScalarField &s, FFT &fft);
-    void momD2(ScalarField &s, FFT &fft);
-private:
-    bool                       freeMomPropDone_, GFSrcDone_, phasesDone_;
-    std::string                freeMomPropName_, GFSrcName_, fftName_;
-    std::vector<std::string>   phaseName_;
-    std::vector<ScalarField *> phase_;
-};
-
-MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);
-
-END_MODULE_NAMESPACE
-
-END_HADRONS_NAMESPACE
-
-#endif // Hadrons_MScalar_ChargedProp_hpp_
diff --git a/extras/Hadrons/Modules/MScalar/FreeProp.hpp b/extras/Hadrons/Modules/MScalar/FreeProp.hpp
deleted file mode 100644
index 6b956134..00000000
--- a/extras/Hadrons/Modules/MScalar/FreeProp.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-#ifndef Hadrons_MScalar_FreeProp_hpp_
-#define Hadrons_MScalar_FreeProp_hpp_
-
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
-
-BEGIN_HADRONS_NAMESPACE
-
-/******************************************************************************
- *                               FreeProp                                     *
- ******************************************************************************/
-BEGIN_MODULE_NAMESPACE(MScalar)
-
-class FreePropPar: Serializable
-{
-public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
-                                    std::string, source,
-                                    double,      mass,
-                                    std::string, output);
-};
-
-class TFreeProp: public Module<FreePropPar>
-{
-public:
-    SCALAR_TYPE_ALIASES(SIMPL,);
-public:
-    // constructor
-    TFreeProp(const std::string name);
-    // destructor
-    virtual ~TFreeProp(void) = default;
-    // dependency relation
-    virtual std::vector<std::string> getInput(void);
-    virtual std::vector<std::string> getOutput(void);
-protected:
-    // setup
-    virtual void setup(void);
-    // execution
-    virtual void execute(void);
-private:
-    std::string freeMomPropName_;
-    bool        freePropDone_;
-};
-
-MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar);
-
-END_MODULE_NAMESPACE
-
-END_HADRONS_NAMESPACE
-
-#endif // Hadrons_MScalar_FreeProp_hpp_
diff --git a/extras/Hadrons/Modules/MScalar/Scalar.hpp b/extras/Hadrons/Modules/MScalar/Scalar.hpp
deleted file mode 100644
index db702ff2..00000000
--- a/extras/Hadrons/Modules/MScalar/Scalar.hpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef Hadrons_Scalar_hpp_
-#define Hadrons_Scalar_hpp_
-
-#define FREEMOMPROP(m) "_scalar_mom_prop_" + std::to_string(m)
-
-#endif // Hadrons_Scalar_hpp_
diff --git a/extras/Hadrons/Modules/templates/Module.cc.template b/extras/Hadrons/Modules/templates/Module.cc.template
deleted file mode 100644
index 0c509d6d..00000000
--- a/extras/Hadrons/Modules/templates/Module.cc.template
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <Grid/Hadrons/Modules/___FILEBASENAME___.hpp>
-
-using namespace Grid;
-using namespace Hadrons;
-
-/******************************************************************************
-*                  T___FILEBASENAME___ implementation                             *
-******************************************************************************/
-// constructor /////////////////////////////////////////////////////////////////
-T___FILEBASENAME___::T___FILEBASENAME___(const std::string name)
-: Module<___FILEBASENAME___Par>(name)
-{}
-
-// dependencies/products ///////////////////////////////////////////////////////
-std::vector<std::string> T___FILEBASENAME___::getInput(void)
-{
-    std::vector<std::string> in;
-    
-    return in;
-}
-
-std::vector<std::string> T___FILEBASENAME___::getOutput(void)
-{
-    std::vector<std::string> out = {getName()};
-    
-    return out;
-}
-
-// setup ///////////////////////////////////////////////////////////////////////
-void T___FILEBASENAME___::setup(void)
-{
-
-}
-
-// execution ///////////////////////////////////////////////////////////////////
-void T___FILEBASENAME___::execute(void)
-{
-
-}
diff --git a/extras/Hadrons/Modules/templates/Module.hpp.template b/extras/Hadrons/Modules/templates/Module.hpp.template
deleted file mode 100644
index fb43260f..00000000
--- a/extras/Hadrons/Modules/templates/Module.hpp.template
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef Hadrons____FILEBASENAME____hpp_
-#define Hadrons____FILEBASENAME____hpp_
-
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
-
-BEGIN_HADRONS_NAMESPACE
-
-/******************************************************************************
- *                         ___FILEBASENAME___                                 *
- ******************************************************************************/
-class ___FILEBASENAME___Par: Serializable
-{
-public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(___FILEBASENAME___Par, 
-                                    unsigned int, i);
-};
-
-class T___FILEBASENAME___: public Module<___FILEBASENAME___Par>
-{
-public:
-    // constructor
-    T___FILEBASENAME___(const std::string name);
-    // destructor
-    virtual ~T___FILEBASENAME___(void) = default;
-    // dependency relation
-    virtual std::vector<std::string> getInput(void);
-    virtual std::vector<std::string> getOutput(void);
-    // setup
-    virtual void setup(void);
-    // execution
-    virtual void execute(void);
-};
-
-MODULE_REGISTER(___FILEBASENAME___, T___FILEBASENAME___);
-
-END_HADRONS_NAMESPACE
-
-#endif // Hadrons____FILEBASENAME____hpp_
diff --git a/extras/Hadrons/Modules/templates/Module_tmp.hpp.template b/extras/Hadrons/Modules/templates/Module_tmp.hpp.template
deleted file mode 100644
index 2ee053a9..00000000
--- a/extras/Hadrons/Modules/templates/Module_tmp.hpp.template
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef Hadrons____FILEBASENAME____hpp_
-#define Hadrons____FILEBASENAME____hpp_
-
-#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Hadrons/Module.hpp>
-#include <Grid/Hadrons/ModuleFactory.hpp>
-
-BEGIN_HADRONS_NAMESPACE
-
-/******************************************************************************
- *                         ___FILEBASENAME___                                 *
- ******************************************************************************/
-class ___FILEBASENAME___Par: Serializable
-{
-public:
-    GRID_SERIALIZABLE_CLASS_MEMBERS(___FILEBASENAME___Par,
-                                    unsigned int, i);
-};
-
-template <typename FImpl>
-class T___FILEBASENAME___: public Module<___FILEBASENAME___Par>
-{
-public:
-    // constructor
-    T___FILEBASENAME___(const std::string name);
-    // destructor
-    virtual ~T___FILEBASENAME___(void) = default;
-    // dependency relation
-    virtual std::vector<std::string> getInput(void);
-    virtual std::vector<std::string> getOutput(void);
-    // setup
-    virtual void setup(void);
-    // execution
-    virtual void execute(void);
-};
-
-MODULE_REGISTER(___FILEBASENAME___, T___FILEBASENAME___<FIMPL>);
-
-/******************************************************************************
- *                 T___FILEBASENAME___ implementation                             *
- ******************************************************************************/
-// constructor /////////////////////////////////////////////////////////////////
-template <typename FImpl>
-T___FILEBASENAME___<FImpl>::T___FILEBASENAME___(const std::string name)
-: Module<___FILEBASENAME___Par>(name)
-{}
-
-// dependencies/products ///////////////////////////////////////////////////////
-template <typename FImpl>
-std::vector<std::string> T___FILEBASENAME___<FImpl>::getInput(void)
-{
-    std::vector<std::string> in;
-    
-    return in;
-}
-
-template <typename FImpl>
-std::vector<std::string> T___FILEBASENAME___<FImpl>::getOutput(void)
-{
-    std::vector<std::string> out = {getName()};
-    
-    return out;
-}
-
-// setup ///////////////////////////////////////////////////////////////////////
-template <typename FImpl>
-void T___FILEBASENAME___<FImpl>::setup(void)
-{
-    
-}
-
-// execution ///////////////////////////////////////////////////////////////////
-template <typename FImpl>
-void T___FILEBASENAME___<FImpl>::execute(void)
-{
-    
-}
-
-END_HADRONS_NAMESPACE
-
-#endif // Hadrons____FILEBASENAME____hpp_
diff --git a/extras/Hadrons/add_module.sh b/extras/Hadrons/add_module.sh
deleted file mode 100755
index d5d23ea4..00000000
--- a/extras/Hadrons/add_module.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/env bash
-
-if (( $# != 1 && $# != 2)); then
-    echo "usage: `basename $0` <module name> [<namespace>]" 1>&2
-    exit 1
-fi
-NAME=$1
-NS=$2
-
-if (( $# == 1 )); then
-	if [ -e "Modules/${NAME}.cc" ] || [ -e "Modules/${NAME}.hpp" ]; then
-	    echo "error: files Modules/${NAME}.* already exists" 1>&2
-	    exit 1
-	fi
-	sed "s/___FILEBASENAME___/${NAME}/g" Modules/templates/Module.cc.template > Modules/${NAME}.cc
-	sed "s/___FILEBASENAME___/${NAME}/g" Modules/templates/Module.hpp.template > Modules/${NAME}.hpp
-elif (( $# == 2 )); then
-	mkdir -p Modules/${NS}
-	if [ -e "Modules/${NS}/${NAME}.cc" ] || [ -e "Modules/${NS}/${NAME}.hpp" ]; then
-	    echo "error: files Modules/${NS}/${NAME}.* already exists" 1>&2
-	    exit 1
-	fi
-	TMPCC=".${NS}.${NAME}.tmp.cc"
-	TMPHPP=".${NS}.${NAME}.tmp.hpp"
-	sed "s/___FILEBASENAME___/${NAME}/g" Modules/templates/Module_in_NS.cc.template  > ${TMPCC}
-	sed "s/___FILEBASENAME___/${NAME}/g" Modules/templates/Module_in_NS.hpp.template > ${TMPHPP}
-	sed "s/___NAMESPACE___/${NS}/g" ${TMPCC}  > Modules/${NS}/${NAME}.cc
-	sed "s/___NAMESPACE___/${NS}/g" ${TMPHPP} > Modules/${NS}/${NAME}.hpp
-	rm -f ${TMPCC} ${TMPHPP}
-fi
-./make_module_list.sh
diff --git a/extras/Hadrons/add_module_template.sh b/extras/Hadrons/add_module_template.sh
deleted file mode 100755
index 0069fcea..00000000
--- a/extras/Hadrons/add_module_template.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-if (( $# != 1 && $# != 2)); then
-    echo "usage: `basename $0` <module name> [<namespace>]" 1>&2
-    exit 1
-fi
-NAME=$1
-NS=$2
-
-if (( $# == 1 )); then
-	if [ -e "Modules/${NAME}.cc" ] || [ -e "Modules/${NAME}.hpp" ]; then
-	    echo "error: files Modules/${NAME}.* already exists" 1>&2
-	    exit 1
-	fi
-	sed "s/___FILEBASENAME___/${NAME}/g" Modules/templates/Module_tmp.hpp.template > Modules/${NAME}.hpp
-elif (( $# == 2 )); then
-	mkdir -p Modules/${NS}
-	if [ -e "Modules/${NS}/${NAME}.cc" ] || [ -e "Modules/${NS}/${NAME}.hpp" ]; then
-	    echo "error: files Modules/${NS}/${NAME}.* already exists" 1>&2
-	    exit 1
-	fi
-	TMPCC=".${NS}.${NAME}.tmp.cc"
-	TMPHPP=".${NS}.${NAME}.tmp.hpp"
-	sed "s/___FILEBASENAME___/${NAME}/g" Modules/templates/Module_tmp_in_NS.hpp.template > ${TMPHPP}
-	sed "s/___NAMESPACE___/${NS}/g" ${TMPHPP} > Modules/${NS}/${NAME}.hpp
-	rm -f ${TMPCC} ${TMPHPP}
-fi
-./make_module_list.sh
diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc
deleted file mode 100644
index 199bb5cd..00000000
--- a/extras/Hadrons/modules.inc
+++ /dev/null
@@ -1,43 +0,0 @@
-modules_cc =\
-  Modules/MContraction/WeakHamiltonianEye.cc \
-  Modules/MContraction/WeakNeutral4ptDisc.cc \
-  Modules/MContraction/WeakHamiltonianNonEye.cc \
-  Modules/MGauge/Load.cc \
-  Modules/MGauge/Unit.cc \
-  Modules/MGauge/StochEm.cc \
-  Modules/MGauge/Random.cc \
-  Modules/MScalar/FreeProp.cc \
-  Modules/MScalar/ChargedProp.cc
-
-modules_hpp =\
-  Modules/MContraction/Baryon.hpp \
-  Modules/MContraction/Meson.hpp \
-  Modules/MContraction/WeakHamiltonian.hpp \
-  Modules/MContraction/WeakHamiltonianNonEye.hpp \
-  Modules/MContraction/DiscLoop.hpp \
-  Modules/MContraction/WeakNeutral4ptDisc.hpp \
-  Modules/MContraction/Gamma3pt.hpp \
-  Modules/MContraction/WardIdentity.hpp \
-  Modules/MContraction/WeakHamiltonianEye.hpp \
-  Modules/MFermion/GaugeProp.hpp \
-  Modules/MSource/SeqGamma.hpp \
-  Modules/MSource/Point.hpp \
-  Modules/MSource/Wall.hpp \
-  Modules/MSource/Z2.hpp \
-  Modules/MSource/SeqConserved.hpp \
-  Modules/MSink/Smear.hpp \
-  Modules/MSink/Point.hpp \
-  Modules/MSolver/RBPrecCG.hpp \
-  Modules/MGauge/Load.hpp \
-  Modules/MGauge/Unit.hpp \
-  Modules/MGauge/Random.hpp \
-  Modules/MGauge/StochEm.hpp \
-  Modules/MUtilities/TestSeqGamma.hpp \
-  Modules/MUtilities/TestSeqConserved.hpp \
-  Modules/MLoop/NoiseLoop.hpp \
-  Modules/MScalar/FreeProp.hpp \
-  Modules/MScalar/Scalar.hpp \
-  Modules/MScalar/ChargedProp.hpp \
-  Modules/MAction/DWF.hpp \
-  Modules/MAction/Wilson.hpp
-
diff --git a/extras/Makefile.am b/extras/Makefile.am
deleted file mode 100644
index d8c2b675..00000000
--- a/extras/Makefile.am
+++ /dev/null
@@ -1 +0,0 @@
-SUBDIRS = Hadrons
\ No newline at end of file
diff --git a/extras/qed-fvol/Global.cc b/extras/qed-fvol/Global.cc
deleted file mode 100644
index 57ed97cc..00000000
--- a/extras/qed-fvol/Global.cc
+++ /dev/null
@@ -1,11 +0,0 @@
-#include <qed-fvol/Global.hpp>
-
-using namespace Grid;
-using namespace QCD;
-using namespace QedFVol;
-
-QedFVolLogger QedFVol::QedFVolLogError(1,"Error");
-QedFVolLogger QedFVol::QedFVolLogWarning(1,"Warning");
-QedFVolLogger QedFVol::QedFVolLogMessage(1,"Message");
-QedFVolLogger QedFVol::QedFVolLogIterative(1,"Iterative");
-QedFVolLogger QedFVol::QedFVolLogDebug(1,"Debug");
diff --git a/extras/qed-fvol/Global.hpp b/extras/qed-fvol/Global.hpp
deleted file mode 100644
index 7f07200d..00000000
--- a/extras/qed-fvol/Global.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef QedFVol_Global_hpp_
-#define QedFVol_Global_hpp_
-
-#include <Grid/Grid.h>
-
-#define BEGIN_QEDFVOL_NAMESPACE \
-namespace Grid {\
-using namespace QCD;\
-namespace QedFVol {\
-using Grid::operator<<;
-#define END_QEDFVOL_NAMESPACE }}
-
-/* the 'using Grid::operator<<;' statement prevents a very nasty compilation
- * error with GCC (clang compiles fine without it).
- */
-
-BEGIN_QEDFVOL_NAMESPACE
-
-class QedFVolLogger: public Logger
-{
-public:
-    QedFVolLogger(int on, std::string nm): Logger("QedFVol", on, nm,
-                                                  GridLogColours, "BLACK"){};
-};
-
-#define LOG(channel) std::cout << QedFVolLog##channel
-#define QEDFVOL_ERROR(msg)\
-LOG(Error) << msg << " (" << __FUNCTION__ << " at " << __FILE__ << ":"\
-           << __LINE__ << ")" << std::endl;\
-abort();
-
-#define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
-
-extern QedFVolLogger QedFVolLogError;
-extern QedFVolLogger QedFVolLogWarning;
-extern QedFVolLogger QedFVolLogMessage;
-extern QedFVolLogger QedFVolLogIterative;
-extern QedFVolLogger QedFVolLogDebug;
-
-END_QEDFVOL_NAMESPACE
-
-#endif // QedFVol_Global_hpp_
diff --git a/extras/qed-fvol/Makefile.am b/extras/qed-fvol/Makefile.am
deleted file mode 100644
index 0a9030c7..00000000
--- a/extras/qed-fvol/Makefile.am
+++ /dev/null
@@ -1,9 +0,0 @@
-AM_CXXFLAGS += -I$(top_srcdir)/extras
-
-bin_PROGRAMS = qed-fvol
-
-qed_fvol_SOURCES =   \
-    qed-fvol.cc      \
-    Global.cc
-
-qed_fvol_LDADD   = -lGrid
diff --git a/extras/qed-fvol/WilsonLoops.h b/extras/qed-fvol/WilsonLoops.h
deleted file mode 100644
index 8d2f3c43..00000000
--- a/extras/qed-fvol/WilsonLoops.h
+++ /dev/null
@@ -1,265 +0,0 @@
-#ifndef QEDFVOL_WILSONLOOPS_H
-#define QEDFVOL_WILSONLOOPS_H
-
-#include <Global.hpp>
-
-BEGIN_QEDFVOL_NAMESPACE
-
-template <class Gimpl> class NewWilsonLoops : public Gimpl {
-public:
-  INHERIT_GIMPL_TYPES(Gimpl);
-
-  typedef typename Gimpl::GaugeLinkField GaugeMat;
-  typedef typename Gimpl::GaugeField GaugeLorentz;
-
-  //////////////////////////////////////////////////
-  // directed plaquette oriented in mu,nu plane
-  //////////////////////////////////////////////////
-  static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
-                           const int mu, const int nu) {
-    // Annoyingly, must use either scope resolution to find dependent base
-    // class,
-    // or this-> ; there is no "this" in a static method. This forces explicit
-    // Gimpl scope
-    // resolution throughout the usage in this file, and rather defeats the
-    // purpose of deriving
-    // from Gimpl.
-    plaq = Gimpl::CovShiftBackward(
-        U[mu], mu, Gimpl::CovShiftBackward(
-                       U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
-  }
-  //////////////////////////////////////////////////
-  // trace of directed plaquette oriented in mu,nu plane
-  //////////////////////////////////////////////////
-  static void traceDirPlaquette(LatticeComplex &plaq,
-                                const std::vector<GaugeMat> &U, const int mu,
-                                const int nu) {
-    GaugeMat sp(U[0].Grid());
-    dirPlaquette(sp, U, mu, nu);
-    plaq = trace(sp);
-  }
-  //////////////////////////////////////////////////
-  // sum over all planes of plaquette
-  //////////////////////////////////////////////////
-  static void sitePlaquette(LatticeComplex &Plaq,
-                            const std::vector<GaugeMat> &U) {
-    LatticeComplex sitePlaq(U[0].Grid());
-    Plaq = Zero();
-    for (int mu = 1; mu < U[0].Grid()->_ndimension; mu++) {
-      for (int nu = 0; nu < mu; nu++) {
-        traceDirPlaquette(sitePlaq, U, mu, nu);
-        Plaq = Plaq + sitePlaq;
-      }
-    }
-  }
-  //////////////////////////////////////////////////
-  // sum over all x,y,z,t and over all planes of plaquette
-  //////////////////////////////////////////////////
-  static Real sumPlaquette(const GaugeLorentz &Umu) {
-    std::vector<GaugeMat> U(4, Umu.Grid());
-
-    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
-      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-    }
-
-    LatticeComplex Plaq(Umu.Grid());
-
-    sitePlaquette(Plaq, U);
-
-    TComplex Tp = sum(Plaq);
-    Complex p = TensorRemove(Tp);
-    return p.real();
-  }
-  //////////////////////////////////////////////////
-  // average over all x,y,z,t and over all planes of plaquette
-  //////////////////////////////////////////////////
-  static Real avgPlaquette(const GaugeLorentz &Umu) {
-    int ndim = Umu.Grid()->_ndimension;
-    Real sumplaq = sumPlaquette(Umu);
-    Real vol = Umu.Grid()->gSites();
-    Real faces = (1.0 * ndim * (ndim - 1)) / 2.0;
-    return sumplaq / vol / faces / Nc; // Nc dependent... FIXME
-  }
-
-  //////////////////////////////////////////////////
-  // Wilson loop of size (R1, R2), oriented in mu,nu plane
-  //////////////////////////////////////////////////
-  static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
-                           const int Rmu, const int Rnu,
-                           const int mu, const int nu) {
-    wl = U[nu];
-
-    for(int i = 0; i < Rnu-1; i++){
-      wl = Gimpl::CovShiftForward(U[nu], nu, wl);
-    }
-
-    for(int i = 0; i < Rmu; i++){
-      wl = Gimpl::CovShiftForward(U[mu], mu, wl);
-    }
-
-    for(int i = 0; i < Rnu; i++){
-      wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
-    }
-
-    for(int i = 0; i < Rmu; i++){
-      wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
-    }
-  }
-  //////////////////////////////////////////////////
-  // trace of Wilson Loop oriented in mu,nu plane
-  //////////////////////////////////////////////////
-  static void traceWilsonLoop(LatticeComplex &wl,
-                                const std::vector<GaugeMat> &U,
-                                const int Rmu, const int Rnu,
-                                const int mu, const int nu) {
-    GaugeMat sp(U[0].Grid());
-    wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
-    wl = trace(sp);
-  }
-  //////////////////////////////////////////////////
-  // sum over all planes of Wilson loop
-  //////////////////////////////////////////////////
-  static void siteWilsonLoop(LatticeComplex &Wl,
-                            const std::vector<GaugeMat> &U,
-                            const int R1, const int R2) {
-    LatticeComplex siteWl(U[0].Grid());
-    Wl = Zero();
-    for (int mu = 1; mu < U[0].Grid()->_ndimension; mu++) {
-      for (int nu = 0; nu < mu; nu++) {
-        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
-        Wl = Wl + siteWl;
-        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
-        Wl = Wl + siteWl;
-      }
-    }
-  }
-  //////////////////////////////////////////////////
-  // sum over planes of Wilson loop with length R1
-  // in the time direction
-  //////////////////////////////////////////////////
-  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
-                            const std::vector<GaugeMat> &U,
-                            const int R1, const int R2) {
-    LatticeComplex siteWl(U[0].Grid());
-
-    int ndim = U[0].Grid()->_ndimension;
-
-    Wl = Zero();
-    for (int nu = 0; nu < ndim - 1; nu++) {
-      traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
-      Wl = Wl + siteWl;
-    }
-  }
-  //////////////////////////////////////////////////
-  // sum Wilson loop over all planes orthogonal to the time direction
-  //////////////////////////////////////////////////
-  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
-                            const std::vector<GaugeMat> &U,
-                            const int R1, const int R2) {
-    LatticeComplex siteWl(U[0].Grid());
-
-    Wl = Zero();
-    for (int mu = 1; mu < U[0].Grid()->_ndimension - 1; mu++) {
-      for (int nu = 0; nu < mu; nu++) {
-        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
-        Wl = Wl + siteWl;
-        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
-        Wl = Wl + siteWl;
-      }
-    }
-  }
-  //////////////////////////////////////////////////
-  // sum over all x,y,z,t and over all planes of Wilson loop
-  //////////////////////////////////////////////////
-  static Real sumWilsonLoop(const GaugeLorentz &Umu,
-                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu.Grid());
-
-    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
-      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-    }
-
-    LatticeComplex Wl(Umu.Grid());
-
-    siteWilsonLoop(Wl, U, R1, R2);
-
-    TComplex Tp = sum(Wl);
-    Complex p = TensorRemove(Tp);
-    return p.real();
-  }
-  //////////////////////////////////////////////////
-  // sum over all x,y,z,t and over all planes of timelike Wilson loop
-  //////////////////////////////////////////////////
-  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
-                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu.Grid());
-
-    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
-      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-    }
-
-    LatticeComplex Wl(Umu.Grid());
-
-    siteTimelikeWilsonLoop(Wl, U, R1, R2);
-
-    TComplex Tp = sum(Wl);
-    Complex p = TensorRemove(Tp);
-    return p.real();
-  }
-  //////////////////////////////////////////////////
-  // sum over all x,y,z,t and over all planes of spatial Wilson loop
-  //////////////////////////////////////////////////
-  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
-                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu.Grid());
-
-    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
-      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
-    }
-
-    LatticeComplex Wl(Umu.Grid());
-
-    siteSpatialWilsonLoop(Wl, U, R1, R2);
-
-    TComplex Tp = sum(Wl);
-    Complex p = TensorRemove(Tp);
-    return p.real();
-  }
-  //////////////////////////////////////////////////
-  // average over all x,y,z,t and over all planes of Wilson loop
-  //////////////////////////////////////////////////
-  static Real avgWilsonLoop(const GaugeLorentz &Umu,
-                            const int R1, const int R2) {
-    int ndim = Umu.Grid()->_ndimension;
-    Real sumWl = sumWilsonLoop(Umu, R1, R2);
-    Real vol = Umu.Grid()->gSites();
-    Real faces = 1.0 * ndim * (ndim - 1);
-    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
-  }
-  //////////////////////////////////////////////////
-  // average over all x,y,z,t and over all planes of timelike Wilson loop
-  //////////////////////////////////////////////////
-  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
-                            const int R1, const int R2) {
-    int ndim = Umu.Grid()->_ndimension;
-    Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
-    Real vol = Umu.Grid()->gSites();
-    Real faces = 1.0 * (ndim - 1);
-    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
-  }
-  //////////////////////////////////////////////////
-  // average over all x,y,z,t and over all planes of spatial Wilson loop
-  //////////////////////////////////////////////////
-  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
-                            const int R1, const int R2) {
-    int ndim = Umu.Grid()->_ndimension;
-    Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
-    Real vol = Umu.Grid()->gSites();
-    Real faces = 1.0 * (ndim - 1) * (ndim - 2);
-    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
-  }
-};
-
-END_QEDFVOL_NAMESPACE
-
-#endif // QEDFVOL_WILSONLOOPS_H
diff --git a/extras/qed-fvol/qed-fvol.cc b/extras/qed-fvol/qed-fvol.cc
deleted file mode 100644
index 3ecac2fc..00000000
--- a/extras/qed-fvol/qed-fvol.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-#include <Global.hpp>
-#include <WilsonLoops.h>
-
-using namespace Grid;
-using namespace QCD;
-using namespace QedFVol;
-
-typedef PeriodicGaugeImpl<QedGimplR>    QedPeriodicGimplR;
-typedef PhotonR::GaugeField             EmField;
-typedef PhotonR::GaugeLinkField         EmComp;
-
-const int NCONFIGS = 10;
-const int NWILSON = 10;
-
-int main(int argc, char *argv[])
-{
-    // parse command line
-    std::string parameterFileName;
-    
-    if (argc < 2)
-    {
-        std::cerr << "usage: " << argv[0] << " <parameter file> [Grid options]";
-        std::cerr << std::endl;
-        std::exit(EXIT_FAILURE);
-    }
-    parameterFileName = argv[1];
-    
-    // initialization
-    Grid_init(&argc, &argv);
-    QedFVolLogError.Active(GridLogError.isActive());
-    QedFVolLogWarning.Active(GridLogWarning.isActive());
-    QedFVolLogMessage.Active(GridLogMessage.isActive());
-    QedFVolLogIterative.Active(GridLogIterative.isActive());
-    QedFVolLogDebug.Active(GridLogDebug.isActive());
-    LOG(Message) << "Grid initialized" << std::endl;
-    
-    // QED stuff
-    std::vector<int> latt_size   = GridDefaultLatt();
-    std::vector<int> simd_layout = GridDefaultSimd(4, vComplex::Nsimd());
-    std::vector<int> mpi_layout  = GridDefaultMpi();
-    GridCartesian    grid(latt_size,simd_layout,mpi_layout);
-    GridParallelRNG  pRNG(&grid);
-    PhotonR          photon(PhotonR::Gauge::feynman,
-                            PhotonR::ZmScheme::qedL);
-    EmField          a(&grid);
-    EmField          expA(&grid);
-
-    Complex imag_unit(0, 1);
-
-    Real wlA;
-    std::vector<Real> logWlAvg(NWILSON, 0.0), logWlTime(NWILSON, 0.0), logWlSpace(NWILSON, 0.0);
-
-    pRNG.SeedRandomDevice();
-
-    LOG(Message) << "Wilson loop calculation beginning" << std::endl;
-    for(int ic = 0; ic < NCONFIGS; ic++){
-        LOG(Message) << "Configuration " << ic <<std::endl;
-        photon.StochasticField(a, pRNG);
-
-        // Exponentiate photon field
-        expA = exp(imag_unit*a);
-
-        // Calculate Wilson loops
-        for(int iw=1; iw<=NWILSON; iw++){
-            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, iw, iw) * 3;
-            logWlAvg[iw-1] -= 2*log(wlA);
-            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, iw, iw) * 3;
-            logWlTime[iw-1] -= 2*log(wlA);
-            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, iw, iw) * 3;
-            logWlSpace[iw-1] -= 2*log(wlA);
-        }
-    }
-    LOG(Message) << "Wilson loop calculation completed" << std::endl;
-    
-    // Calculate Wilson loops
-    for(int iw=1; iw<=10; iw++){
-        LOG(Message) << iw << 'x' << iw << " Wilson loop" << std::endl;
-        LOG(Message) << "-2log(W) average: " << logWlAvg[iw-1]/NCONFIGS << std::endl;
-        LOG(Message) << "-2log(W) timelike: " << logWlTime[iw-1]/NCONFIGS << std::endl;
-        LOG(Message) << "-2log(W) spatial: " << logWlSpace[iw-1]/NCONFIGS << std::endl;
-    }
-
-    // epilogue
-    LOG(Message) << "Grid is finalizing now" << std::endl;
-    Grid_finalize();
-    
-    return EXIT_SUCCESS;
-}
diff --git a/grid-config.in b/grid-config.in
index bd340846..f39b01bd 100755
--- a/grid-config.in
+++ b/grid-config.in
@@ -61,6 +61,10 @@ while test $# -gt 0; do
       echo @GRID_CXXFLAGS@
     ;;
     
+    --cxx)
+      echo @GRID_CXX@
+    ;;
+    
     --ldflags)
       echo @GRID_LDFLAGS@
     ;;
diff --git a/lib/.dirstamp b/lib/.dirstamp
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/Hadrons b/lib/Hadrons
deleted file mode 120000
index 1f422592..00000000
--- a/lib/Hadrons
+++ /dev/null
@@ -1 +0,0 @@
-../extras/Hadrons
\ No newline at end of file
diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
deleted file mode 100644
index 60a388bd..00000000
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ /dev/null
@@ -1,607 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
-
-Copyright (C) 2017
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
-#define GRID_BLOCK_CONJUGATE_GRADIENT_H
-
-#include <Grid/lattice/Lattice_matrix_reduction.h>
-
-NAMESPACE_BEGIN(Grid);
-
-enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
-
-//////////////////////////////////////////////////////////////////////////
-// Block Conjugate gradient. Dimension zero should be the block direction
-//////////////////////////////////////////////////////////////////////////
-template <class Field>
-class BlockConjugateGradient : public OperatorFunction<Field> {
-public:
-
-
-  typedef typename Field::scalar_type scomplex;
-
-  int blockDim ;
-  int Nblock;
-
-  BlockCGtype CGtype;
-  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
-                           // Defaults true.
-  RealD Tolerance;
-  Integer MaxIterations;
-  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
-  
-  BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
-  {};
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Thin QR factorisation (google it)
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  void ThinQRfact (Eigen::MatrixXcd &m_rr,
-		   Eigen::MatrixXcd &C,
-		   Eigen::MatrixXcd &Cinv,
-		   Field & Q,
-		   const Field & R)
-  {
-    int Orthog = blockDim; // First dimension is block dim; this is an assumption
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    //Dimensions
-    // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
-    //
-    // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
-    //
-    //   Q  C = R => Q = R C^{-1}
-    //
-    // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} 
-    //
-    // Set C = L^{dag}, and then Q^dag Q = ident 
-    //
-    // Checks:
-    // Cdag C = Rdag R ; passes.
-    // QdagQ  = 1      ; passes
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    sliceInnerProductMatrix(m_rr,R,R,Orthog);
-
-    // Force manifest hermitian to avoid rounding related
-    m_rr = 0.5*(m_rr+m_rr.adjoint());
-
-#if 0
-    std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl;
-    Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL(); 
-    std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl;
-    auto  D_ldlt = m_rr.ldlt().vectorD(); 
-    std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl;
-#endif
-
-    //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl;
-    Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
-    //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl;
-    C    = L.adjoint();
-    Cinv = C.inverse();
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Q = R C^{-1}
-    //
-    // Q_j  = R_i Cinv(i,j) 
-    //
-    // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    sliceMulMatrix(Q,Cinv,R,Orthog);
-  }
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Call one of several implementations
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
-  {
-    if ( CGtype == BlockCGrQ ) {
-      BlockCGrQsolve(Linop,Src,Psi);
-    } else if (CGtype == BlockCG ) {
-      BlockCGsolve(Linop,Src,Psi);
-    } else if (CGtype == CGmultiRHS ) {
-      CGmultiRHSsolve(Linop,Src,Psi);
-    } else {
-      assert(0);
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////
-  // BlockCGrQ implementation:
-  //--------------------------
-  // X is guess/Solution
-  // B is RHS
-  // Solve A X_i = B_i    ;        i refers to Nblock index
-  ////////////////////////////////////////////////////////////////////////////
-  void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) 
-  {
-    int Orthog = blockDim; // First dimension is block dim; this is an assumption
-    Nblock = B.Grid()->_fdimensions[Orthog];
-
-    std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
-
-    X.Checkerboard() = B.Checkerboard();
-    conformable(X, B);
-
-    Field tmp(B);
-    Field Q(B);
-    Field D(B);
-    Field Z(B);
-    Field AD(B);
-
-    Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-    Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-    Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-    Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-    Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-    Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-    Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-    Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-    Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-
-    // Initial residual computation & set up
-    std::vector<RealD> residuals(Nblock);
-    std::vector<RealD> ssq(Nblock);
-
-    sliceNorm(ssq,B,Orthog);
-    RealD sssum=0;
-    for(int b=0;b<Nblock;b++) sssum+=ssq[b];
-
-    sliceNorm(residuals,B,Orthog);
-    for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-    sliceNorm(residuals,X,Orthog);
-    for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-    /************************************************************************
-     * Block Conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
-     ************************************************************************
-     * Dimensions:
-     *
-     *   X,B==(Nferm x Nblock)
-     *   A==(Nferm x Nferm)
-     *  
-     * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
-     * 
-     * QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
-     * for k: 
-     *   Z  = AD
-     *   M  = [D^dag Z]^{-1}
-     *   X  = X + D MC
-     *   QS = Q - ZM
-     *   D  = Q + D S^dag
-     *   C  = S C
-     */
-    ///////////////////////////////////////
-    // Initial block: initial search dir is guess
-    ///////////////////////////////////////
-    std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
-
-    //1.  QC = R = B-AX, D = Q     ; QC => Thin QR factorisation (google it)
-
-    Linop.HermOp(X, AD);
-    tmp = B - AD;  
-    //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
-    ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
-    //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
-    //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
-    //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
-    //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
-    D=Q;
-
-    std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
-
-    ///////////////////////////////////////
-    // Timers
-    ///////////////////////////////////////
-    GridStopWatch sliceInnerTimer;
-    GridStopWatch sliceMaddTimer;
-    GridStopWatch QRTimer;
-    GridStopWatch MatrixTimer;
-    GridStopWatch SolverTimer;
-    SolverTimer.Start();
-
-    int k;
-    for (k = 1; k <= MaxIterations; k++) {
-
-      //3. Z  = AD
-      MatrixTimer.Start();
-      Linop.HermOp(D, Z);      
-      MatrixTimer.Stop();
-      //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
-
-      //4. M  = [D^dag Z]^{-1}
-      sliceInnerTimer.Start();
-      sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
-      sliceInnerTimer.Stop();
-      m_M       = m_DZ.inverse();
-      //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
-    
-      //5. X  = X + D MC
-      m_tmp     = m_M * m_C;
-      sliceMaddTimer.Start();
-      sliceMaddMatrix(X,m_tmp, D,X,Orthog);     
-      sliceMaddTimer.Stop();
-
-      //6. QS = Q - ZM
-      sliceMaddTimer.Start();
-      sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
-      sliceMaddTimer.Stop();
-      QRTimer.Start();
-      ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
-      QRTimer.Stop();
-    
-      //7. D  = Q + D S^dag
-      m_tmp = m_S.adjoint();
-      sliceMaddTimer.Start();
-      sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
-      sliceMaddTimer.Stop();
-
-      //8. C  = S C
-      m_C = m_S*m_C;
-    
-      /*********************
-       * convergence monitor
-       *********************
-       */
-      m_rr = m_C.adjoint() * m_C;
-
-      RealD max_resid=0;
-      RealD rrsum=0;
-      RealD rr;
-
-      for(int b=0;b<Nblock;b++) {
-	rrsum+=real(m_rr(b,b));
-	rr    =real(m_rr(b,b))/ssq[b];
-	if ( rr > max_resid ) max_resid = rr;
-      }
-
-      std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
-		<<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
-
-      if ( max_resid < Tolerance*Tolerance ) { 
-
-	SolverTimer.Stop();
-
-	std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
-
-	for(int b=0;b<Nblock;b++){
-	  std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
-		    << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
-	}
-	std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
-
-	Linop.HermOp(X, AD);
-	AD = AD-B;
-	std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
-
-	std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
-	std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed()  <<std::endl;
-	    
-	IterationsToComplete = k;
-	return;
-      }
-
-    }
-    std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
-
-    if (ErrorOnNoConverge) assert(0);
-    IterationsToComplete = k;
-  }
-  //////////////////////////////////////////////////////////////////////////
-  // Block Conjugate gradient; Original O'Leary Dimension zero should be the block direction
-  //////////////////////////////////////////////////////////////////////////
-  void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
-  {
-    int Orthog = blockDim; // First dimension is block dim; this is an assumption
-    Nblock = Src.Grid()->_fdimensions[Orthog];
-
-    std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
-
-    Psi.Checkerboard() = Src.Checkerboard();
-    conformable(Psi, Src);
-
-    Field P(Src);
-    Field AP(Src);
-    Field R(Src);
-  
-    Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
-    Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
-    Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-    Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-    Eigen::MatrixXcd m_alpha      = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-    Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-    // Initial residual computation & set up
-    std::vector<RealD> residuals(Nblock);
-    std::vector<RealD> ssq(Nblock);
-
-    sliceNorm(ssq,Src,Orthog);
-    RealD sssum=0;
-    for(int b=0;b<Nblock;b++) sssum+=ssq[b];
-
-    sliceNorm(residuals,Src,Orthog);
-    for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-    sliceNorm(residuals,Psi,Orthog);
-    for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-    // Initial search dir is guess
-    Linop.HermOp(Psi, AP);
-  
-
-    /************************************************************************
-     * Block Conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
-     ************************************************************************
-     * O'Leary : R = B - A X
-     * O'Leary : P = M R ; preconditioner M = 1
-     * O'Leary : alpha = PAP^{-1} RMR
-     * O'Leary : beta  = RMR^{-1}_old RMR_new
-     * O'Leary : X=X+Palpha
-     * O'Leary : R_new=R_old-AP alpha
-     * O'Leary : P=MR_new+P beta
-     */
-
-    R = Src - AP;  
-    P = R;
-    sliceInnerProductMatrix(m_rr,R,R,Orthog);
-
-    GridStopWatch sliceInnerTimer;
-    GridStopWatch sliceMaddTimer;
-    GridStopWatch MatrixTimer;
-    GridStopWatch SolverTimer;
-    SolverTimer.Start();
-
-    int k;
-    for (k = 1; k <= MaxIterations; k++) {
-
-      RealD rrsum=0;
-      for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
-
-      std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
-		<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
-
-      MatrixTimer.Start();
-      Linop.HermOp(P, AP);
-      MatrixTimer.Stop();
-
-      // Alpha
-      sliceInnerTimer.Start();
-      sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
-      sliceInnerTimer.Stop();
-      m_pAp_inv = m_pAp.inverse();
-      m_alpha   = m_pAp_inv * m_rr ;
-
-      // Psi, R update
-      sliceMaddTimer.Start();
-      sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi
-      sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
-      sliceMaddTimer.Stop();
-
-      // Beta
-      m_rr_inv = m_rr.inverse();
-      sliceInnerTimer.Start();
-      sliceInnerProductMatrix(m_rr,R,R,Orthog);
-      sliceInnerTimer.Stop();
-      m_beta = m_rr_inv *m_rr;
-
-      // Search update
-      sliceMaddTimer.Start();
-      sliceMaddMatrix(AP,m_beta,P,R,Orthog);
-      sliceMaddTimer.Stop();
-      P= AP;
-
-      /*********************
-       * convergence monitor
-       *********************
-       */
-      RealD max_resid=0;
-      RealD rr;
-      for(int b=0;b<Nblock;b++){
-	rr = real(m_rr(b,b))/ssq[b];
-	if ( rr > max_resid ) max_resid = rr;
-      }
-    
-      if ( max_resid < Tolerance*Tolerance ) { 
-
-	SolverTimer.Stop();
-
-	std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
-	for(int b=0;b<Nblock;b++){
-	  std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
-		    << std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
-	}
-	std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
-
-	Linop.HermOp(Psi, AP);
-	AP = AP-Src;
-	std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
-
-	std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
-	    
-	IterationsToComplete = k;
-	return;
-      }
-
-    }
-    std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
-
-    if (ErrorOnNoConverge) assert(0);
-    IterationsToComplete = k;
-  }
-  //////////////////////////////////////////////////////////////////////////
-  // multiRHS Conjugate gradient. Dimension zero should be the block direction
-  // Use this for spread out across nodes
-  //////////////////////////////////////////////////////////////////////////
-  void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
-  {
-    int Orthog = blockDim; // First dimension is block dim
-    Nblock = Src.Grid()->_fdimensions[Orthog];
-
-    std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
-
-    Psi.Checkerboard() = Src.Checkerboard();
-    conformable(Psi, Src);
-
-    Field P(Src);
-    Field AP(Src);
-    Field R(Src);
-  
-    std::vector<ComplexD> v_pAp(Nblock);
-    std::vector<RealD> v_rr (Nblock);
-    std::vector<RealD> v_rr_inv(Nblock);
-    std::vector<RealD> v_alpha(Nblock);
-    std::vector<RealD> v_beta(Nblock);
-
-    // Initial residual computation & set up
-    std::vector<RealD> residuals(Nblock);
-    std::vector<RealD> ssq(Nblock);
-
-    sliceNorm(ssq,Src,Orthog);
-    RealD sssum=0;
-    for(int b=0;b<Nblock;b++) sssum+=ssq[b];
-
-    sliceNorm(residuals,Src,Orthog);
-    for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-    sliceNorm(residuals,Psi,Orthog);
-    for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
-
-    // Initial search dir is guess
-    Linop.HermOp(Psi, AP);
-
-    R = Src - AP;  
-    P = R;
-    sliceNorm(v_rr,R,Orthog);
-
-    GridStopWatch sliceInnerTimer;
-    GridStopWatch sliceMaddTimer;
-    GridStopWatch sliceNormTimer;
-    GridStopWatch MatrixTimer;
-    GridStopWatch SolverTimer;
-
-    SolverTimer.Start();
-    int k;
-    for (k = 1; k <= MaxIterations; k++) {
-
-      RealD rrsum=0;
-      for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
-
-      std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
-		<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
-
-      MatrixTimer.Start();
-      Linop.HermOp(P, AP);
-      MatrixTimer.Stop();
-
-      // Alpha
-      sliceInnerTimer.Start();
-      sliceInnerProductVector(v_pAp,P,AP,Orthog);
-      sliceInnerTimer.Stop();
-      for(int b=0;b<Nblock;b++){
-	v_alpha[b] = v_rr[b]/real(v_pAp[b]);
-      }
-
-      // Psi, R update
-      sliceMaddTimer.Start();
-      sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi
-      sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
-      sliceMaddTimer.Stop();
-
-      // Beta
-      for(int b=0;b<Nblock;b++){
-	v_rr_inv[b] = 1.0/v_rr[b];
-      }
-      sliceNormTimer.Start();
-      sliceNorm(v_rr,R,Orthog);
-      sliceNormTimer.Stop();
-      for(int b=0;b<Nblock;b++){
-	v_beta[b] = v_rr_inv[b] *v_rr[b];
-      }
-
-      // Search update
-      sliceMaddTimer.Start();
-      sliceMaddVector(P,v_beta,P,R,Orthog);
-      sliceMaddTimer.Stop();
-
-      /*********************
-       * convergence monitor
-       *********************
-       */
-      RealD max_resid=0;
-      for(int b=0;b<Nblock;b++){
-	RealD rr = v_rr[b]/ssq[b];
-	if ( rr > max_resid ) max_resid = rr;
-      }
-    
-      if ( max_resid < Tolerance*Tolerance ) { 
-
-	SolverTimer.Stop();
-
-	std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
-	for(int b=0;b<Nblock;b++){
-	  std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
-	}
-	std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
-
-	Linop.HermOp(Psi, AP);
-	AP = AP-Src;
-	std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
-
-	std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
-	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed()     <<std::endl;
-	std::cout << GridLogMessage << "\tInnerProd  " << sliceInnerTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tNorm       " << sliceNormTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed()  <<std::endl;
-
-
-	IterationsToComplete = k;
-	return;
-      }
-
-    }
-    std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
-
-    if (ErrorOnNoConverge) assert(0);
-    IterationsToComplete = k;
-  }
-
-};
-
-NAMESPACE_END(Grid);
-#endif
diff --git a/lib/algorithms/iterative/SchurRedBlack.h b/lib/algorithms/iterative/SchurRedBlack.h
deleted file mode 100644
index a915d8d0..00000000
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/SchurRedBlack.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_SCHUR_RED_BLACK_H
-#define GRID_SCHUR_RED_BLACK_H
-
-
-/*
- * Red black Schur decomposition
- *
- *  M = (Mee Meo) =  (1             0 )   (Mee   0               )  (1 Mee^{-1} Meo)
- *      (Moe Moo)    (Moe Mee^-1    1 )   (0   Moo-Moe Mee^-1 Meo)  (0   1         )
- *                =         L                     D                     U
- *
- * L^-1 = (1              0 )
- *        (-MoeMee^{-1}   1 )   
- * L^{dag} = ( 1       Mee^{-dag} Moe^{dag} )
- *           ( 0       1                    )
- * L^{-d}  = ( 1      -Mee^{-dag} Moe^{dag} )
- *           ( 0       1                    )
- *
- * U^-1 = (1   -Mee^{-1} Meo)
- *        (0    1           )
- * U^{dag} = ( 1                 0)
- *           (Meo^dag Mee^{-dag} 1)
- * U^{-dag} = (  1                 0)
- *            (-Meo^dag Mee^{-dag} 1)
- ***********************
- *     M psi = eta
- ***********************
- *Odd
- * i)                 D_oo psi_o =  L^{-1}  eta_o
- *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
- *
- * Wilson:
- *      (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
- * Stag:
- *      D_oo psi_o = L^{-1}  eta =    (eta_o - Moe Mee^{-1} eta_e)
- *
- * L^-1 eta_o= (1              0 ) (e
- *             (-MoeMee^{-1}   1 )   
- *
- *Even
- * ii)  Mee psi_e + Meo psi_o = src_e
- *
- *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
- *
- * 
- * TODO: Other options:
- * 
- * a) change checkerboards for Schur e<->o
- *
- * Left precon by Moo^-1
- * b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 =  (D_oo)^dag M_oo^-dag Moo^-1 L^{-1}  eta_o
- *                              eta_o'     = (D_oo)^dag  M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
- *
- * Right precon by Moo^-1
- * c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1}  eta_o
- *                              eta_o'     = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
- *                              psi_o = M_oo^-1 phi_o
- * TODO: Deflation 
- */
-NAMESPACE_BEGIN(Grid);
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Take a matrix and form a Red Black solver calling a Herm solver
-// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Now make the norm reflect extra factor of Mee
-template<class Field> class SchurRedBlackStaggeredSolve {
-private:
-  OperatorFunction<Field> & _HermitianRBSolver;
-  int CBfactorise;
-public:
-
-  /////////////////////////////////////////////////////
-  // Wrap the usual normal equations Schur trick
-  /////////////////////////////////////////////////////
-  SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver)  :
-    _HermitianRBSolver(HermitianRBSolver) 
-  { 
-    CBfactorise=0;
-  };
-
-  template<class Matrix>
-  void operator() (Matrix & _Matrix,const Field &in, Field &out){
-
-    // FIXME CGdiagonalMee not implemented virtual function
-    // FIXME use CBfactorise to control schur decomp
-    GridBase *grid = _Matrix.RedBlackGrid();
-    GridBase *fgrid= _Matrix.Grid();
-
-    SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-    Field src_e(grid);
-    Field src_o(grid);
-    Field sol_e(grid);
-    Field sol_o(grid);
-    Field   tmp(grid);
-    Field  Mtmp(grid);
-    Field resid(fgrid);
-      
-    std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
-    pickCheckerboard(Even,src_e,in);
-    pickCheckerboard(Odd ,src_o,in);
-    pickCheckerboard(Even,sol_e,out);
-    pickCheckerboard(Odd ,sol_o,out);
-
-    std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
-    
-    /////////////////////////////////////////////////////
-    // src_o = (source_o - Moe MeeInv source_e)
-    /////////////////////////////////////////////////////
-    _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
-    _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
-    tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
-
-    //src_o = tmp;     assert(src_o.Checkerboard() ==Odd);
-    _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
-
-    //////////////////////////////////////////////////////////////
-    // Call the red-black solver
-    //////////////////////////////////////////////////////////////
-    std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
-    _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
-    std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;
-
-    ///////////////////////////////////////////////////
-    // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-    ///////////////////////////////////////////////////
-    _Matrix.Meooe(sol_o,tmp);        assert(  tmp.Checkerboard()   ==Even);
-    src_e = src_e-tmp;               assert(  src_e.Checkerboard() ==Even);
-    _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.Checkerboard() ==Even);
-     
-    std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
-    setCheckerboard(out,sol_e); assert(  sol_e.Checkerboard() ==Even);
-    setCheckerboard(out,sol_o); assert(  sol_o.Checkerboard() ==Odd );
-    std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
-
-    // Verify the unprec residual
-    _Matrix.M(out,resid); 
-    resid = resid-in;
-    RealD ns = norm2(in);
-    RealD nr = norm2(resid);
-    std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-  }     
-};
-template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Take a matrix and form a Red Black solver calling a Herm solver
-// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class SchurRedBlackDiagMooeeSolve {
-private:
-  OperatorFunction<Field> & _HermitianRBSolver;
-  int CBfactorise;
-public:
-
-  /////////////////////////////////////////////////////
-  // Wrap the usual normal equations Schur trick
-  /////////////////////////////////////////////////////
-  SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0)  :  _HermitianRBSolver(HermitianRBSolver) 
-  { 
-    CBfactorise=cb;
-  };
-  template<class Matrix>
-  void operator() (Matrix & _Matrix,const Field &in, Field &out){
-
-    // FIXME CGdiagonalMee not implemented virtual function
-    // FIXME use CBfactorise to control schur decomp
-    GridBase *grid = _Matrix.RedBlackGrid();
-    GridBase *fgrid= _Matrix.Grid();
-
-    SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-    Field src_e(grid);
-    Field src_o(grid);
-    Field sol_e(grid);
-    Field sol_o(grid);
-    Field   tmp(grid);
-    Field  Mtmp(grid);
-    Field resid(fgrid);
-
-    pickCheckerboard(Even,src_e,in);
-    pickCheckerboard(Odd ,src_o,in);
-    pickCheckerboard(Even,sol_e,out);
-    pickCheckerboard(Odd ,sol_o,out);
-    
-    /////////////////////////////////////////////////////
-    // src_o = Mdag * (source_o - Moe MeeInv source_e)
-    /////////////////////////////////////////////////////
-    _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
-    _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
-    tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
-
-    // get the right MpcDag
-    _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
-
-    //////////////////////////////////////////////////////////////
-    // Call the red-black solver
-    //////////////////////////////////////////////////////////////
-    std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-    _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
-
-    ///////////////////////////////////////////////////
-    // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-    ///////////////////////////////////////////////////
-    _Matrix.Meooe(sol_o,tmp);        assert(  tmp.Checkerboard()   ==Even);
-    src_e = src_e-tmp;               assert(  src_e.Checkerboard() ==Even);
-    _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.Checkerboard() ==Even);
-     
-    setCheckerboard(out,sol_e); assert(  sol_e.Checkerboard() ==Even);
-    setCheckerboard(out,sol_o); assert(  sol_o.Checkerboard() ==Odd );
-
-    // Verify the unprec residual
-    _Matrix.M(out,resid); 
-    resid = resid-in;
-    RealD ns = norm2(in);
-    RealD nr = norm2(resid);
-
-    std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-  }     
-};
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Take a matrix and form a Red Black solver calling a Herm solver
-// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class SchurRedBlackDiagTwoSolve {
-private:
-  OperatorFunction<Field> & _HermitianRBSolver;
-  int CBfactorise;
-public:
-
-  /////////////////////////////////////////////////////
-  // Wrap the usual normal equations Schur trick
-  /////////////////////////////////////////////////////
-  SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver)  :
-    _HermitianRBSolver(HermitianRBSolver) 
-  { 
-    CBfactorise=0;
-  };
-
-  template<class Matrix>
-  void operator() (Matrix & _Matrix,const Field &in, Field &out){
-
-    // FIXME CGdiagonalMee not implemented virtual function
-    // FIXME use CBfactorise to control schur decomp
-    GridBase *grid = _Matrix.RedBlackGrid();
-    GridBase *fgrid= _Matrix.Grid();
-
-    SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-    Field src_e(grid);
-    Field src_o(grid);
-    Field sol_e(grid);
-    Field sol_o(grid);
-    Field   tmp(grid);
-    Field  Mtmp(grid);
-    Field resid(fgrid);
-
-    pickCheckerboard(Even,src_e,in);
-    pickCheckerboard(Odd ,src_o,in);
-    pickCheckerboard(Even,sol_e,out);
-    pickCheckerboard(Odd ,sol_o,out);
-    
-    /////////////////////////////////////////////////////
-    // src_o = Mdag * (source_o - Moe MeeInv source_e)
-    /////////////////////////////////////////////////////
-    _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
-    _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
-    tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
-
-    // get the right MpcDag
-    _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
-
-    //////////////////////////////////////////////////////////////
-    // Call the red-black solver
-    //////////////////////////////////////////////////////////////
-    std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-    //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
-    _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.Checkerboard()==Odd);
-    _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.Checkerboard()   ==Odd);
-
-    ///////////////////////////////////////////////////
-    // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-    ///////////////////////////////////////////////////
-    _Matrix.Meooe(sol_o,tmp);        assert(  tmp.Checkerboard()   ==Even);
-    src_e = src_e-tmp;               assert(  src_e.Checkerboard() ==Even);
-    _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.Checkerboard() ==Even);
-     
-    setCheckerboard(out,sol_e); assert(  sol_e.Checkerboard() ==Even);
-    setCheckerboard(out,sol_o); assert(  sol_o.Checkerboard() ==Odd );
-
-    // Verify the unprec residual
-    _Matrix.M(out,resid); 
-    resid = resid-in;
-    RealD ns = norm2(in);
-    RealD nr = norm2(resid);
-
-    std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-  }     
-};
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Take a matrix and form a Red Black solver calling a Herm solver
-// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class SchurRedBlackDiagTwoMixed {
-private:
-  LinearFunction<Field> & _HermitianRBSolver;
-  int CBfactorise;
-public:
-
-  /////////////////////////////////////////////////////
-  // Wrap the usual normal equations Schur trick
-  /////////////////////////////////////////////////////
-  SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver)  :
-    _HermitianRBSolver(HermitianRBSolver) 
-  { 
-    CBfactorise=0;
-  };
-
-  template<class Matrix>
-  void operator() (Matrix & _Matrix,const Field &in, Field &out){
-
-    // FIXME CGdiagonalMee not implemented virtual function
-    // FIXME use CBfactorise to control schur decomp
-    GridBase *grid = _Matrix.RedBlackGrid();
-    GridBase *fgrid= _Matrix.Grid();
-
-    SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
- 
-    Field src_e(grid);
-    Field src_o(grid);
-    Field sol_e(grid);
-    Field sol_o(grid);
-    Field   tmp(grid);
-    Field  Mtmp(grid);
-    Field resid(fgrid);
-
-    pickCheckerboard(Even,src_e,in);
-    pickCheckerboard(Odd ,src_o,in);
-    pickCheckerboard(Even,sol_e,out);
-    pickCheckerboard(Odd ,sol_o,out);
-    
-    /////////////////////////////////////////////////////
-    // src_o = Mdag * (source_o - Moe MeeInv source_e)
-    /////////////////////////////////////////////////////
-    _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.Checkerboard() ==Even);
-    _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.Checkerboard() ==Odd);     
-    tmp=src_o-Mtmp;                  assert(  tmp.Checkerboard() ==Odd);     
-
-    // get the right MpcDag
-    _HermOpEO.MpcDag(tmp,src_o);     assert(src_o.Checkerboard() ==Odd);       
-
-    //////////////////////////////////////////////////////////////
-    // Call the red-black solver
-    //////////////////////////////////////////////////////////////
-    std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
-    //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.Checkerboard()==Odd);
-    //      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.Checkerboard()==Odd);
-    _HermitianRBSolver(src_o,tmp);  assert(tmp.Checkerboard()==Odd);
-    _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.Checkerboard()   ==Odd);
-
-    ///////////////////////////////////////////////////
-    // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
-    ///////////////////////////////////////////////////
-    _Matrix.Meooe(sol_o,tmp);        assert(  tmp.Checkerboard()   ==Even);
-    src_e = src_e-tmp;               assert(  src_e.Checkerboard() ==Even);
-    _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.Checkerboard() ==Even);
-     
-    setCheckerboard(out,sol_e); assert(  sol_e.Checkerboard() ==Even);
-    setCheckerboard(out,sol_o); assert(  sol_o.Checkerboard() ==Odd );
-
-    // Verify the unprec residual
-    _Matrix.M(out,resid); 
-    resid = resid-in;
-    RealD ns = norm2(in);
-    RealD nr = norm2(resid);
-
-    std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
-  }     
-};
-
-NAMESPACE_END(Grid);
-#endif
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
deleted file mode 100644
index 6c863afc..00000000
--- a/lib/lattice/Lattice_reduction.h
+++ /dev/null
@@ -1,383 +0,0 @@
-/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid 
-    Source file: ./lib/lattice/Lattice_reduction.h
-    Copyright (C) 2015
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_LATTICE_REDUCTION_H
-#define GRID_LATTICE_REDUCTION_H
-
-#ifdef GRID_WARN_SUBOPTIMAL
-#warning "Optimisation alert all these reduction loops are NOT threaded "
-#endif     
-
-NAMESPACE_BEGIN(Grid);
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Deterministic Reduction operations
-////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-  ComplexD nrm = innerProduct(arg,arg);
-  return real(nrm); 
-}
-
-// Double inner product
-template<class vobj>
-inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) 
-{
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_typeD vector_type;
-  scalar_type  nrm;
-  
-  GridBase *grid = left.Grid();
-  
-  std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
-
-  auto left_v = left.View();
-  auto right_v=right.View();
-
-  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
-    int mywork, myoff;
-    GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
-    
-    decltype(innerProductD(left_v[0],right_v[0])) vnrm=Zero(); // private to thread; sub summation
-    for(int ss=myoff;ss<mywork+myoff; ss++){
-      vnrm = vnrm + innerProductD(left_v[ss],right_v[ss]);
-    }
-    sumarray[thr]=TensorRemove(vnrm) ;
-  });
-  
-  vector_type vvnrm; vvnrm=Zero();  // sum across threads
-  for(int i=0;i<grid->SumArraySize();i++){
-    vvnrm = vvnrm+sumarray[i];
-  } 
-  nrm = Reduce(vvnrm);// sum across simd
-  right.Grid()->GlobalSum(nrm);
-  return nrm;
-}
- 
-template<class Op,class T1>
-inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
-  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
-{
-  return sum(closure(expr));
-}
-
-template<class Op,class T1,class T2>
-inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
-  ->typename decltype(expr.op.func(eval(0,expr.arg1,eval(0,expr.arg2))))::scalar_object
-{
-  return sum(closure(expr));
-}
-
-
-template<class Op,class T1,class T2,class T3>
-inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
-  ->typename decltype(expr.op.func(eval(0,expr.arg1),
-				   eval(0,expr.arg2),
-				   eval(0,expr.arg3)
-				   ))::scalar_object
-{
-  return sum(closure(expr));
-}
-
-template<class vobj>
-inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
-{
-  GridBase *grid=arg.Grid();
-  int Nsimd = grid->Nsimd();
-  
-  std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
-  for(int i=0;i<grid->SumArraySize();i++){
-    sumarray[i]=Zero();
-  }
-  auto arg_v = arg.View();
-  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
-    int mywork, myoff;
-    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
-    
-    vobj vvsum=Zero();
-    for(int ss=myoff;ss<mywork+myoff; ss++){
-      vvsum = vvsum + arg_v[ss];
-    }
-    sumarray[thr]=vvsum;
-  });
-  
-  vobj vsum=Zero();  // sum across threads
-  for(int i=0;i<grid->SumArraySize();i++){
-    vsum = vsum+sumarray[i];
-  } 
-  
-  typedef typename vobj::scalar_object sobj;
-  sobj ssum; zeroit(ssum);
-  
-  ExtractBuffer<sobj>               buf(Nsimd);
-  extract(vsum,buf);
-  
-  for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
-  arg.Grid()->GlobalSum(ssum);
-  
-  return ssum;
-}
-
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
-{
-  ///////////////////////////////////////////////////////
-  // FIXME precision promoted summation
-  // may be important for correlation functions
-  // But easily avoided by using double precision fields
-  ///////////////////////////////////////////////////////
-  typedef typename vobj::scalar_object sobj;
-  GridBase  *grid = Data.Grid();
-  assert(grid!=NULL);
-
-  const int    Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  assert(orthogdim >= 0);
-  assert(orthogdim < Nd);
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-
-  std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
-  std::vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
-  ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD
-
-  result.resize(fd); // And then global sum to return the same vector to every node 
-  for(int r=0;r<rd;r++){
-    lvSum[r]=Zero();
-  }
-
-  int e1=    grid->_slice_nblock[orthogdim];
-  int e2=    grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-
-  // sum over reduced dimension planes, breaking out orthog dir
-  auto Data_v = Data.View();
-  thread_loop( (int r=0;r<rd;r++),{
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-	int ss= so+n*stride+b;
-	lvSum[r]=lvSum[r]+Data_v[ss];
-      }
-    }
-  });
-
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  Coordinate icoor(Nd);
-
-  for(int rt=0;rt<rd;rt++){
-
-    extract(lvSum[rt],extracted);
-
-    for(int idx=0;idx<Nsimd;idx++){
-
-      grid->iCoorFromIindex(icoor,idx);
-
-      int ldx =rt+icoor[orthogdim]*rd;
-
-      lsSum[ldx]=lsSum[ldx]+extracted[idx];
-
-    }
-  }
-  
-  // sum over nodes.
-  sobj gsum;
-  for(int t=0;t<fd;t++){
-    int pt = t/ld; // processor plane
-    int lt = t%ld;
-    if ( pt == grid->_processor_coor[orthogdim] ) {
-      gsum=lsSum[lt];
-    } else {
-      gsum=Zero();
-    }
-
-    grid->GlobalSum(gsum);
-
-    result[t]=gsum;
-  }
-}
-
-template<class vobj>
-static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
-{
-  typedef typename vobj::vector_type   vector_type;
-  typedef typename vobj::scalar_type   scalar_type;
-  GridBase  *grid = lhs.Grid();
-  assert(grid!=NULL);
-  conformable(grid,rhs.Grid());
-
-  const int    Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  assert(orthogdim >= 0);
-  assert(orthogdim < Nd);
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-
-  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
-  std::vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars
-  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);                  // splitting the SIMD
-
-  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
-  for(int r=0;r<rd;r++){
-    lvSum[r]=Zero();
-  }
-
-  int e1=    grid->_slice_nblock[orthogdim];
-  int e2=    grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-
-  auto lhs_v = lhs.View();
-  auto rhs_v = rhs.View();
-  thread_loop( (int r=0;r<rd;r++),{
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-	int ss= so+n*stride+b;
-	vector_type vv = TensorRemove(innerProduct(lhs_v[ss],rhs_v[ss]));
-	lvSum[r]=lvSum[r]+vv;
-      }
-    }
-  });
-
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  Coordinate icoor(Nd);
-  for(int rt=0;rt<rd;rt++){
-
-    iScalar<vector_type> temp; 
-    temp._internal = lvSum[rt];
-    extract(temp,extracted);
-
-    for(int idx=0;idx<Nsimd;idx++){
-
-      grid->iCoorFromIindex(icoor,idx);
-
-      int ldx =rt+icoor[orthogdim]*rd;
-
-      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
-
-    }
-  }
-  
-  // sum over nodes.
-  scalar_type gsum;
-  for(int t=0;t<fd;t++){
-    int pt = t/ld; // processor plane
-    int lt = t%ld;
-    if ( pt == grid->_processor_coor[orthogdim] ) {
-      gsum=lsSum[lt];
-    } else {
-      gsum=scalar_type(0.0);
-    }
-
-    grid->GlobalSum(gsum);
-
-    result[t]=gsum;
-  }
-}
-template<class vobj>
-static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) 
-{
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  
-  int Nblock = rhs.Grid()->GlobalDimensions()[Orthog];
-  std::vector<ComplexD> ip(Nblock);
-  sn.resize(Nblock);
-  
-  sliceInnerProductVector(ip,rhs,rhs,Orthog);
-  for(int ss=0;ss<Nblock;ss++){
-    sn[ss] = real(ip[ss]);
-  }
-};
-
-
-template<class vobj>
-static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
-			    int orthogdim,RealD scale=1.0) 
-{    
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::tensor_reduced tensor_reduced;
-  
-  scalar_type zscale(scale);
-
-  GridBase *grid  = X.Grid();
-
-  int Nsimd  =grid->Nsimd();
-  int Nblock =grid->GlobalDimensions()[orthogdim];
-
-  int fd     =grid->_fdimensions[orthogdim];
-  int ld     =grid->_ldimensions[orthogdim];
-  int rd     =grid->_rdimensions[orthogdim];
-
-  int e1     =grid->_slice_nblock[orthogdim];
-  int e2     =grid->_slice_block [orthogdim];
-  int stride =grid->_slice_stride[orthogdim];
-
-  Coordinate icoor;
-
-  for(int r=0;r<rd;r++){
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-    vector_type    av;
-
-    for(int l=0;l<Nsimd;l++){
-      grid->iCoorFromIindex(icoor,l);
-      int ldx =r+icoor[orthogdim]*rd;
-      scalar_type *as =(scalar_type *)&av;
-      as[l] = scalar_type(a[ldx])*zscale;
-    }
-
-    tensor_reduced at; at=av;
-
-    auto X_v = X.View();
-    auto Y_v = Y.View();
-    auto R_v = R.View();
-    thread_loop_collapse2( (int n=0;n<e1;n++),{
-      for(int b=0;b<e2;b++){
-	int ss= so+n*stride+b;
-	R_v[ss] = at*X_v[ss]+Y_v[ss];
-      }
-    });
-  }
-};
-
-NAMESPACE_END(Grid);
-#endif
-
-
-
diff --git a/lib/pugixml/README.md b/lib/pugixml/README.md
deleted file mode 100644
index 9d8a935f..00000000
--- a/lib/pugixml/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
-pugixml [![Build Status](https://travis-ci.org/zeux/pugixml.svg?branch=master)](https://travis-ci.org/zeux/pugixml) [![Build status](https://ci.appveyor.com/api/projects/status/9hdks1doqvq8pwe7/branch/master?svg=true)](https://ci.appveyor.com/project/zeux/pugixml)
-=======
-
-pugixml is a C++ XML processing library, which consists of a DOM-like interface with rich traversal/modification
-capabilities, an extremely fast XML parser which constructs the DOM tree from an XML file/buffer, and an XPath 1.0
-implementation for complex data-driven tree queries. Full Unicode support is also available, with Unicode interface
-variants and conversions between different Unicode encodings (which happen automatically during parsing/saving).
-
-pugixml is used by a lot of projects, both open-source and proprietary, for performance and easy-to-use interface.
-
-## Documentation
-
-Documentation for the current release of pugixml is available on-line as two separate documents:
-
-* [Quick-start guide](http://pugixml.org/docs/quickstart.html), that aims to provide enough information to start using the library;
-* [Complete reference manual](http://pugixml.org/docs/manual.html), that describes all features of the library in detail.
-
-You’re advised to start with the quick-start guide; however, many important library features are either not described in it at all or only mentioned briefly; if you require more information you should read the complete manual.
-
-## License
-This library is available to anybody free of charge, under the terms of MIT License:
-
-Copyright (c) 2006-2015 Arseny Kapoulkine
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc
deleted file mode 100644
index 0dcb4ff7..00000000
--- a/lib/qcd/action/fermion/StaggeredKernels.cc
+++ /dev/null
@@ -1,275 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi, Peter Boyle
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
-
-template <class Impl>
-StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
-
-////////////////////////////////////////////
-// Generic implementation; move to different file?
-////////////////////////////////////////////
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U,
-					   SiteSpinor *buf, int sF,
-					   int sU, const FermionFieldView &in, SiteSpinor &out,int threeLink) {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-  int skew = 0;
-  if (threeLink) skew=8;
-  ///////////////////////////
-  // Xp
-  ///////////////////////////
-
-  SE = st.GetEntry(ptype, Xp+skew, sF);
-  if (SE->_is_local) {
-    if (SE->_permute) {
-      chi_p = &chi;
-      permute(chi,  in[SE->_offset], ptype);
-    } else {
-      chi_p = &in[SE->_offset];
-    }
-  } else {
-    chi_p = &buf[SE->_offset];
-  }
-  Impl::multLink(Uchi, U[sU], *chi_p, Xp);
-
-  ///////////////////////////
-  // Yp
-  ///////////////////////////
-  SE = st.GetEntry(ptype, Yp+skew, sF);
-  if (SE->_is_local) {
-    if (SE->_permute) {
-      chi_p = &chi;
-      permute(chi,  in[SE->_offset], ptype);
-    } else {
-      chi_p = &in[SE->_offset];
-    }
-  } else {
-    chi_p = &buf[SE->_offset];
-  }
-  Impl::multLinkAdd(Uchi, U[sU], *chi_p, Yp);
-
-  ///////////////////////////
-  // Zp
-  ///////////////////////////
-  SE = st.GetEntry(ptype, Zp+skew, sF);
-  if (SE->_is_local) {
-    if (SE->_permute) {
-      chi_p = &chi;
-      permute(chi,  in[SE->_offset], ptype);
-    } else {
-      chi_p = &in[SE->_offset];
-    }
-  } else {
-    chi_p = &buf[SE->_offset];
-  }
-  Impl::multLinkAdd(Uchi, U[sU], *chi_p, Zp);
-
-  ///////////////////////////
-  // Tp
-  ///////////////////////////
-  SE = st.GetEntry(ptype, Tp+skew, sF);
-  if (SE->_is_local) {
-    if (SE->_permute) {
-      chi_p = &chi;
-      permute(chi,  in[SE->_offset], ptype);
-    } else {
-      chi_p = &in[SE->_offset];
-    }
-  } else {
-    chi_p = &buf[SE->_offset];
-  }
-  Impl::multLinkAdd(Uchi, U[sU], *chi_p, Tp);
-
-  ///////////////////////////
-  // Xm
-  ///////////////////////////
-  SE = st.GetEntry(ptype, Xm+skew, sF);
-  if (SE->_is_local) {
-    if (SE->_permute) {
-      chi_p = &chi;
-      permute(chi,  in[SE->_offset], ptype);
-    } else {
-      chi_p = &in[SE->_offset];
-    }
-  } else {
-    chi_p = &buf[SE->_offset];
-  }
-  Impl::multLinkAdd(Uchi, U[sU], *chi_p, Xm);
-
-  ///////////////////////////
-  // Ym
-  ///////////////////////////
-  SE = st.GetEntry(ptype, Ym+skew, sF);
-  if (SE->_is_local) {
-    if (SE->_permute) {
-      chi_p = &chi;
-      permute(chi,  in[SE->_offset], ptype);
-    } else {
-      chi_p = &in[SE->_offset];
-    }
-  } else {
-    chi_p = &buf[SE->_offset];
-  }
-  Impl::multLinkAdd(Uchi, U[sU], *chi_p, Ym);
-
-  ///////////////////////////
-  // Zm
-  ///////////////////////////
-  SE = st.GetEntry(ptype, Zm+skew, sF);
-  if (SE->_is_local) {
-    if (SE->_permute) {
-      chi_p = &chi;
-      permute(chi,  in[SE->_offset], ptype);
-    } else {
-      chi_p = &in[SE->_offset];
-    }
-  } else {
-    chi_p = &buf[SE->_offset];
-  }
-  Impl::multLinkAdd(Uchi, U[sU], *chi_p, Zm);
-
-  ///////////////////////////
-  // Tm
-  ///////////////////////////
-  SE = st.GetEntry(ptype, Tm+skew, sF);
-  if (SE->_is_local) {
-    if (SE->_permute) {
-      chi_p = &chi;
-      permute(chi,  in[SE->_offset], ptype);
-    } else {
-      chi_p = &in[SE->_offset];
-    }
-  } else {
-    chi_p = &buf[SE->_offset];
-  }
-  Impl::multLinkAdd(Uchi, U[sU], *chi_p, Tm);
-
-  vstream(out, Uchi);
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-					 SiteSpinor *buf, int LLs, int sU,
-					 const FermionFieldView &in, FermionFieldView &out) {
-  SiteSpinor naik;
-  SiteSpinor naive;
-  int oneLink  =0;
-  int threeLink=1;
-  int dag=1;
-  switch(Opt) {
-#ifdef AVX512
-    //FIXME; move the sign into the Asm routine
-  case OptInlineAsm:
-    DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
-    for(int s=0;s<LLs;s++) {
-      int sF=s+LLs*sU;
-      out[sF]=-out[sF];
-    }
-    break;
-#endif
-  case OptHandUnroll:
-    DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    break;
-  case OptGeneric:
-    for(int s=0;s<LLs;s++){
-      int sF=s+LLs*sU;
-      DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
-      DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
-      out[sF] =-naive-naik; 
-    }
-    break;
-  default:
-    std::cout<<"Oops Opt = "<<Opt<<std::endl;
-    assert(0);
-    break;
-  }
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
-				      SiteSpinor *buf, int LLs,
-				      int sU, const FermionFieldView &in, FermionFieldView &out) 
-{
-  int oneLink  =0;
-  int threeLink=1;
-  SiteSpinor naik;
-  SiteSpinor naive;
-  int dag=0;
-  switch(Opt) {
-#ifdef AVX512
-  case OptInlineAsm:
-    DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
-    break;
-#endif
-  case OptHandUnroll:
-    DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    break;
-  case OptGeneric:
-    for(int s=0;s<LLs;s++){
-      int sF=LLs*sU+s;
-      //      assert(sF<in._odata.size());
-      //      assert(sU< U._odata.size());
-      //      assert(sF>=0);      assert(sU>=0);
-      DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
-      DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
-      out[sF] =naive+naik;
-    }
-    break;
-  default:
-    std::cout<<"Oops Opt = "<<Opt<<std::endl;
-    assert(0);
-    break;
-  }
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopDirK( StencilImpl &st, DoubledGaugeFieldView &U,  DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
-				      int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp) 
-{
-  // Disp should be either +1,-1,+3,-3
-  // What about "dag" ?
-  // Because we work out pU . dS/dU 
-  // U
-  assert(0);
-}
-
-FermOpStaggeredTemplateInstantiate(StaggeredKernels);
-FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
-
-NAMESPACE_END(Grid);
-
diff --git a/lib/qcd/action/fermion/StaggeredKernels.h b/lib/qcd/action/fermion/StaggeredKernels.h
deleted file mode 100644
index 140682a6..00000000
--- a/lib/qcd/action/fermion/StaggeredKernels.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/StaggeredKernels.h
-
-Copyright (C) 2015
-
-Author: Azusa Yamaguchi, Peter Boyle
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-			   /*  END LEGAL */
-#ifndef GRID_QCD_STAGGERED_KERNELS_H
-#define GRID_QCD_STAGGERED_KERNELS_H
-
-NAMESPACE_BEGIN(Grid);
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Helper routines that implement Staggered stencil for a single site.
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class StaggeredKernelsStatic { 
-public:
-  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
-  // S-direction is INNERMOST and takes no part in the parity.
-  static int Opt;  // these are a temporary hack
-};
- 
-template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic { 
-public:
-   
-  INHERIT_IMPL_TYPES(Impl);
-  typedef FermionOperator<Impl> Base;
-   
-public:
-    
-  void DhopDirK(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
-	       int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
-
-  void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, SiteSpinor * buf,
-		     int sF, int sU, const FermionFieldView &in, SiteSpinor &out,int threeLink);
-
-
-  void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, SiteSpinor * buf,
-			 int sF, int sU, const FermionFieldView &in, SiteSpinor&out,int threeLink);
-
-  void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,SiteSpinor * buf,
-		    int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag);
-
-  void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf,
-		   int LLs, int sU, const FermionFieldView &in, FermionFieldView &out);
-      
-  void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
-		int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
-
-  void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, 
-                   int LLs, int sU, const FermionFieldView &in, FermionFieldView &out);
-  
-public:
-
-  StaggeredKernels(const ImplParams &p = ImplParams());
-
-};
-    
-NAMESPACE_END(Grid);
-
-#endif
diff --git a/lib/qcd/action/fermion/StaggeredKernelsHand.cc b/lib/qcd/action/fermion/StaggeredKernelsHand.cc
deleted file mode 100644
index 5e18f0ab..00000000
--- a/lib/qcd/action/fermion/StaggeredKernelsHand.cc
+++ /dev/null
@@ -1,319 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid.h>
-
-#define REGISTER
-
-#define LOAD_CHI(b)				\
-  const SiteSpinor & ref (b[offset]);		\
-  Chi_0=ref()()(0);				\
-  Chi_1=ref()()(1);				\
-  Chi_2=ref()()(2);
-
-
-// To splat or not to splat depends on the implementation
-#define MULT(A,UChi)				\
-  auto & ref(U[sU](A));			\
-  Impl::loadLinkElement(U_00,ref()(0,0));	\
-  Impl::loadLinkElement(U_10,ref()(1,0));	\
-  Impl::loadLinkElement(U_20,ref()(2,0));	\
-  Impl::loadLinkElement(U_01,ref()(0,1));	\
-  Impl::loadLinkElement(U_11,ref()(1,1));	\
-  Impl::loadLinkElement(U_21,ref()(2,1));	\
-  Impl::loadLinkElement(U_02,ref()(0,2));	\
-  Impl::loadLinkElement(U_12,ref()(1,2));	\
-  Impl::loadLinkElement(U_22,ref()(2,2));	\
-  UChi ## _0  = U_00*Chi_0;			\
-  UChi ## _1  = U_10*Chi_0;			\
-  UChi ## _2  = U_20*Chi_0;			\
-  UChi ## _0 += U_01*Chi_1;			\
-  UChi ## _1 += U_11*Chi_1;			\
-  UChi ## _2 += U_21*Chi_1;			\
-  UChi ## _0 += U_02*Chi_2;			\
-  UChi ## _1 += U_12*Chi_2;			\
-  UChi ## _2 += U_22*Chi_2;
-
-#define MULT_ADD(A,UChi)			\
-  auto & ref(U[sU](A));			\
-  Impl::loadLinkElement(U_00,ref()(0,0));	\
-  Impl::loadLinkElement(U_10,ref()(1,0));	\
-  Impl::loadLinkElement(U_20,ref()(2,0));	\
-  Impl::loadLinkElement(U_01,ref()(0,1));	\
-  Impl::loadLinkElement(U_11,ref()(1,1));	\
-  Impl::loadLinkElement(U_21,ref()(2,1));	\
-  Impl::loadLinkElement(U_02,ref()(0,2));	\
-  Impl::loadLinkElement(U_12,ref()(1,2));	\
-  Impl::loadLinkElement(U_22,ref()(2,2));	\
-  UChi ## _0 += U_00*Chi_0;			\
-  UChi ## _1 += U_10*Chi_0;			\
-  UChi ## _2 += U_20*Chi_0;			\
-  UChi ## _0 += U_01*Chi_1;			\
-  UChi ## _1 += U_11*Chi_1;			\
-  UChi ## _2 += U_21*Chi_1;			\
-  UChi ## _0 += U_02*Chi_2;			\
-  UChi ## _1 += U_12*Chi_2;			\
-  UChi ## _2 += U_22*Chi_2;
-
-
-#define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_0,Chi_0);			\
-  permute##dir(Chi_1,Chi_1);			\
-  permute##dir(Chi_2,Chi_2);
-
-NAMESPACE_BEGIN(Grid);
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
-					  SiteSpinor *buf, int LLs,
-					  int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
-{
-  SiteSpinor naik; 
-  SiteSpinor naive;
-  int oneLink  =0;
-  int threeLink=1;
-  Real scale(1.0);
-  
-  if(dag) scale = -1.0;
-  
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-    DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
-    DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
-    out[sF] =scale*(naive+naik);
-  }
-}
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U,
-					       SiteSpinor *buf, int sF,
-					       int sU, const FermionFieldView &in, SiteSpinor &out,int threeLink) 
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  REGISTER Simd even_0; // 12 regs on knc
-  REGISTER Simd even_1;
-  REGISTER Simd even_2;
-  REGISTER Simd odd_0; // 12 regs on knc
-  REGISTER Simd odd_1;
-  REGISTER Simd odd_2;
-
-  REGISTER Simd Chi_0;    // two spinor; 6 regs
-  REGISTER Simd Chi_1;
-  REGISTER Simd Chi_2;
-
-  REGISTER Simd U_00;  // two rows of U matrix
-  REGISTER Simd U_10;
-  REGISTER Simd U_20;  
-  REGISTER Simd U_01;
-  REGISTER Simd U_11;
-  REGISTER Simd U_21;  // 2 reg left.
-  REGISTER Simd U_02;
-  REGISTER Simd U_12;
-  REGISTER Simd U_22; 
-
-  int skew = 0;
-  if (threeLink) skew=8;
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-
-  // Xp
-  SE=st.GetEntry(ptype,Xp+skew,sF);
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-  
-  if ( local ) {
-    LOAD_CHI((&in[0]));
-    if ( perm) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(buf);
-  }
-  {
-    MULT(Xp,even);
-  }
-  
-  // Yp
-  SE=st.GetEntry(ptype,Yp+skew,sF);
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-  
-  if ( local ) {
-    LOAD_CHI((&in[0]));
-    if ( perm) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(buf);
-  }
-  {
-    MULT(Yp,odd);
-  }
-
-
-  // Zp
-  SE=st.GetEntry(ptype,Zp+skew,sF);
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-  
-  if ( local ) {
-    LOAD_CHI((&in[0]));
-    if ( perm) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(buf);
-  }
-  {
-    MULT_ADD(Zp,even);
-  }
-
-  // Tp
-  SE=st.GetEntry(ptype,Tp+skew,sF);
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-  
-  if ( local ) {
-    LOAD_CHI((&in[0]));
-    if ( perm) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(buf);
-  }
-  {
-    MULT_ADD(Tp,odd);
-  }
-  
-  // Xm
-  SE=st.GetEntry(ptype,Xm+skew,sF);
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-  
-  if ( local ) {
-    LOAD_CHI((&in[0]));
-    if ( perm) {
-      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(buf);
-  }
-  {
-    MULT_ADD(Xm,even);
-  }
-  
-  
-  // Ym
-  SE=st.GetEntry(ptype,Ym+skew,sF);
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-  
-  if ( local ) {
-    LOAD_CHI((&in[0]));
-    if ( perm) {
-      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(buf);
-  }
-  {
-    MULT_ADD(Ym,odd);
-  }
-
-  // Zm
-  SE=st.GetEntry(ptype,Zm+skew,sF);
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-  
-  if ( local ) {
-    LOAD_CHI((&in[0]));
-    if ( perm) {
-      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(buf);
-  }
-  {
-    MULT_ADD(Zm,even);
-  }
-
-  // Tm
-  SE=st.GetEntry(ptype,Tm+skew,sF);
-  offset = SE->_offset;
-  local  = SE->_is_local;
-  perm   = SE->_permute;
-  
-  if ( local ) {
-    LOAD_CHI((&in[0]));
-    if ( perm) {
-      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-    }
-  } else { 
-    LOAD_CHI(buf);
-  }
-  {
-    MULT_ADD(Tm,odd);
-  }
-
-  vstream(out()()(0),even_0+odd_0);
-  vstream(out()()(1),even_1+odd_1);
-  vstream(out()()(2),even_2+odd_2);
-
-}
-
-#define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\
-  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
-						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
-						     SiteSpinor *buf, int LLs, \
-						     int sU, const FermionFieldView &in, FermionFieldView &out, int dag);
-
-#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL)				\
-  template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, \
-							  SiteSpinor *buf, int sF, \
-							  int sU, const FermionFieldView &in, SiteSpinor &out,int threeLink) ;
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
-DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
-
-DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
-DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
-DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
-DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);
-
-NAMESPACE_END(Grid);
diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h
deleted file mode 100644
index 14db2e36..00000000
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*************************************************************************************
-
-  Grid physics library, www.github.com/paboyle/Grid
-
-  Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
-
-  Copyright (C) 2015
-
-  Author: Guido Cossu <guido,cossu@ed.ac.uk>
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation; either version 2 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License along
-  with this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-  See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-
-#ifndef SCALAR_INT_ACTION_H
-#define SCALAR_INT_ACTION_H
-
-// Note: this action can completely absorb the ScalarAction for real float fields
-// use the scalarObjs to generalise the structure
-
-NAMESPACE_BEGIN(Grid);
-
-template <class Impl, int Ndim >
-class ScalarInteractionAction : public Action<typename Impl::Field> {
-public:
-  INHERIT_FIELD_TYPES(Impl);
-private:
-  RealD mass_square;
-  RealD lambda;
-
-
-  typedef typename Field::vector_object vobj;
-  typedef CartesianStencil<vobj,vobj> Stencil;
-
-  SimpleCompressor<vobj> compressor;
-  int npoint = 2*Ndim;
-  std::vector<int> directions;//    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
-  std::vector<int> displacements;//  = {1,1,1,1, -1,-1,-1,-1};
-
-
-public:
-
-  ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
-    for (int mu = 0 ; mu < Ndim; mu++){
-      directions[mu]         = mu; directions[mu+Ndim]    = mu;
-      displacements[mu]      =  1; displacements[mu+Ndim] = -1;
-    }
-  }
-
-  virtual std::string LogParameters() {
-    std::stringstream sstream;
-    sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
-    sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
-    return sstream.str();
-  }
-
-  virtual std::string action_name() {return "ScalarAction";}
-
-  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
-
-  virtual RealD S(const Field &p) {
-    assert(p.Grid()->Nd() == Ndim);
-    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements);
-    phiStencil.HaloExchange(p, compressor);
-    Field action(p.Grid()), pshift(p.Grid()), phisquared(p.Grid());
-    auto action_v = action.View();
-    phisquared = p*p;
-    action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
-    for (int mu = 0; mu < Ndim; mu++) {
-      //  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
-      auto p_v = p.View();
-      thread_loop( (int i = 0; i < p.Grid()->oSites(); i++) ,{
-	int permute_type;
-	StencilEntry *SE;
-	vobj temp2;
-	const vobj *temp, *t_p;
-	    
-	SE = phiStencil.GetEntry(permute_type, mu, i);
-	t_p  = &p_v[i];
-	if ( SE->_is_local ) {
-	  temp = &p_v[SE->_offset];
-	  if ( SE->_permute ) {
-	    permute(temp2, *temp, permute_type);
-	    action_v[i] -= temp2*(*t_p) + (*t_p)*temp2;
-	  } else {
-	    action_v[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
-	  }
-	} else {
-	  action_v[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
-	}
-      });
-      //  action -= pshift*p + p*pshift;
-    }
-    // NB the trace in the algebra is normalised to 1/2
-    // minus sign coming from the antihermitian fields
-    return -(TensorRemove(sum(trace(action)))).real();
-  };
-
-  virtual void deriv(const Field &p, Field &force) {
-    assert(p.Grid()->Nd() == Ndim);
-    force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
-    // move this outside
-    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements);
-    phiStencil.HaloExchange(p, compressor);
-      
-    //for (int mu = 0; mu < Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-    for (int point = 0; point < npoint; point++) {
-      auto force_v = force.View();
-      auto p_v     = p.View();
-      thread_loop( (int i = 0; i < p.Grid()->oSites(); i++) ,{
-	const vobj *temp;
-	vobj temp2;
-	int permute_type;
-	StencilEntry *SE;
-	SE = phiStencil.GetEntry(permute_type, point, i);
-	  
-	if ( SE->_is_local ) {
-	  temp = &p_v[SE->_offset];
-	  if ( SE->_permute ) {
-	    permute(temp2, *temp, permute_type);
-	    force_v[i] -= temp2;
-	  } else {
-	    force_v[i] -= *temp;
-	  }
-	} else {
-	  force_v[i] -= phiStencil.CommBuf()[SE->_offset];
-	}
-      });
-    }
-  }
-};
-  
-NAMESPACE_END(Grid);
-
-#endif  // SCALAR_INT_ACTION_H
diff --git a/lib/qcd/hmc/.dirstamp b/lib/qcd/hmc/.dirstamp
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/qcd/hmc/integrators/.dirstamp b/lib/qcd/hmc/integrators/.dirstamp
deleted file mode 100644
index e69de29b..00000000
diff --git a/lib/serialisation/BaseIO.h b/lib/serialisation/BaseIO.h
deleted file mode 100644
index b8ca5e16..00000000
--- a/lib/serialisation/BaseIO.h
+++ /dev/null
@@ -1,528 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/serialisation/BaseIO.h
-
-    Copyright (C) 2015
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_SERIALISATION_ABSTRACT_READER_H
-#define GRID_SERIALISATION_ABSTRACT_READER_H
-
-#include <type_traits>
-
-NAMESPACE_BEGIN(Grid);
-
-// Vector IO utilities ///////////////////////////////////////////////////////
-// helper function to read space-separated values
-template <typename T>
-std::vector<T> strToVec(const std::string s)
-{
-  std::istringstream sstr(s);
-  T                  buf;
-  std::vector<T>     v;
-    
-  while(!sstr.eof())
-    {
-      sstr >> buf;
-      v.push_back(buf);
-    }
-    
-  return v;
-}
-  
-// output to streams for vectors
-template < class T >
-inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
-{
-  os << "[";
-  for (auto &x: v)
-    {
-      os << x << " ";
-    }
-  if (v.size() > 0)
-    {
-      os << "\b";
-    }
-  os << "]";
-    
-  return os;
-}
-  
-// Vector element trait //////////////////////////////////////////////////////  
-template <typename T>
-struct element
-{
-  typedef T type;
-  static constexpr bool is_number = false;
-};
-  
-template <typename T>
-struct element<std::vector<T>>
-{
-  typedef typename element<T>::type type;
-  static constexpr bool is_number = std::is_arithmetic<T>::value
-    or is_complex<T>::value
-    or element<T>::is_number;
-};
-  
-// Vector flattening utility class ////////////////////////////////////////////
-// Class to flatten a multidimensional std::vector
-template <typename V>
-class Flatten
-{
-public:
-  typedef typename element<V>::type Element;
-public:
-  explicit                     Flatten(const V &vector);
-  const V &                    getVector(void);
-  const std::vector<Element> & getFlatVector(void);
-  const std::vector<size_t>  & getDim(void);
-private:
-  void accumulate(const Element &e);
-  template <typename W>
-  void accumulate(const W &v);
-  void accumulateDim(const Element &e);
-  template <typename W>
-  void accumulateDim(const W &v);
-private:
-  const V              &vector_;
-  std::vector<Element> flatVector_;
-  std::vector<size_t>  dim_;
-};
-  
-// Class to reconstruct a multidimensional std::vector
-template <typename V>
-class Reconstruct
-{
-public:
-  typedef typename element<V>::type Element;
-public:
-  Reconstruct(const std::vector<Element> &flatVector,
-	      const std::vector<size_t> &dim);
-  const V &                    getVector(void);
-  const std::vector<Element> & getFlatVector(void);
-  const std::vector<size_t>  & getDim(void);
-private:
-  void fill(std::vector<Element> &v);
-  template <typename W>
-  void fill(W &v);
-  void resize(std::vector<Element> &v, const unsigned int dim);
-  template <typename W>
-  void resize(W &v, const unsigned int dim);
-private:
-  V                          vector_;
-  const std::vector<Element> &flatVector_;
-  std::vector<size_t>        dim_;
-  size_t                     ind_{0};
-  unsigned int               dimInd_{0};
-};
-  
-// Pair IO utilities /////////////////////////////////////////////////////////
-// helper function to parse input in the format "<obj1 obj2>"
-template <typename T1, typename T2>
-inline std::istream & operator>>(std::istream &is, std::pair<T1, T2> &buf)
-{
-  T1 buf1;
-  T2 buf2;
-  char c;
-
-  // Search for "pair" delimiters.
-  do
-    {
-      is.get(c);
-    } while (c != '<' && !is.eof());
-  if (c == '<')
-    {
-      int start = is.tellg();
-      do
-	{
-	  is.get(c);
-	} while (c != '>' && !is.eof());
-      if (c == '>')
-	{
-	  int end = is.tellg();
-	  int psize = end - start - 1;
-
-	  // Only read data between pair limiters.
-	  is.seekg(start);
-	  std::string tmpstr(psize, ' ');
-	  is.read(&tmpstr[0], psize);
-	  std::istringstream temp(tmpstr);
-	  temp >> buf1 >> buf2;
-	  buf = std::make_pair(buf1, buf2);
-	  is.seekg(end);
-	}
-    }
-  is.peek();
-  return is;
-}
-  
-// output to streams for pairs
-template <class T1, class T2>
-inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
-{
-  os << "<" << p.first << " " << p.second << ">";
-  return os;
-}
-
-// Abstract writer/reader classes ////////////////////////////////////////////
-// static polymorphism implemented using CRTP idiom
-class Serializable;
-  
-// Static abstract writer
-template <typename T>
-class Writer
-{
-public:
-  Writer(void);
-  virtual ~Writer(void) = default;
-  void push(const std::string &s);
-  void pop(void);
-  template <typename U>
-  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
-  write(const std::string& s, const U &output);
-  template <typename U>
-  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
-  write(const std::string& s, const U &output);
-private:
-  T *upcast;
-};
-  
-// Static abstract reader
-template <typename T>
-class Reader
-{
-public:
-  Reader(void);
-  virtual ~Reader(void) = default;
-  bool push(const std::string &s);
-  void pop(void);
-  template <typename U>
-  typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
-  read(const std::string& s, U &output);
-  template <typename U>
-  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
-  read(const std::string& s, U &output);
-protected:
-  template <typename U>
-  void fromString(U &output, const std::string &s);
-private:
-  T *upcast;
-};
-
-// What is the vtype
-template<typename T> struct isReader {
-  static const bool value = false;
-};
-template<typename T> struct isWriter {
-  static const bool value = false;
-}; 
-
-
-
-// Generic writer interface
-// serializable base class
-class Serializable
-{
-public:
-  template <typename T>
-  static inline void write(Writer<T> &WR,const std::string &s,
-			   const Serializable &obj)
-  {}
-    
-  template <typename T>
-  static inline void read(Reader<T> &RD,const std::string &s,
-			  Serializable &obj)
-  {}
-    
-  friend inline std::ostream & operator<<(std::ostream &os,
-					  const Serializable &obj)
-  {
-    return os;
-  }
-};
-  
-// Flatten class template implementation /////////////////////////////////////
-template <typename V>
-void Flatten<V>::accumulate(const Element &e)
-{
-  flatVector_.push_back(e);
-}
-  
-template <typename V>
-template <typename W>
-void Flatten<V>::accumulate(const W &v)
-{
-  for (auto &e: v)
-    {
-      accumulate(e);
-    }
-}
-  
-template <typename V>
-void Flatten<V>::accumulateDim(const Element &e) {};
-  
-template <typename V>
-template <typename W>
-void Flatten<V>::accumulateDim(const W &v)
-{
-  dim_.push_back(v.size());
-  accumulateDim(v[0]);
-}
-  
-template <typename V>
-Flatten<V>::Flatten(const V &vector)
-  : vector_(vector)
-{
-  accumulate(vector_);
-  accumulateDim(vector_);
-}
-  
-template <typename V>
-const V & Flatten<V>::getVector(void)
-{
-  return vector_;
-}
-  
-template <typename V>
-const std::vector<typename Flatten<V>::Element> &
-Flatten<V>::getFlatVector(void)
-{
-  return flatVector_;
-}
-  
-template <typename V>
-const std::vector<size_t> & Flatten<V>::getDim(void)
-{
-  return dim_;
-}
-  
-// Reconstruct class template implementation /////////////////////////////////
-template <typename V>
-void Reconstruct<V>::fill(std::vector<Element> &v)
-{
-  for (auto &e: v)
-    {
-      e = flatVector_[ind_++];
-    }
-}
-  
-template <typename V>
-template <typename W>
-void Reconstruct<V>::fill(W &v)
-{
-  for (auto &e: v)
-    {
-      fill(e);
-    }
-}
-  
-template <typename V>
-void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
-{
-  v.resize(dim_[dim]);
-}
-  
-template <typename V>
-template <typename W>
-void Reconstruct<V>::resize(W &v, const unsigned int dim)
-{
-  v.resize(dim_[dim]);
-  for (auto &e: v)
-    {
-      resize(e, dim + 1);
-    }
-}
-  
-template <typename V>
-Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
-			    const std::vector<size_t> &dim)
-  : flatVector_(flatVector)
-  , dim_(dim)
-{
-  resize(vector_, 0);
-  fill(vector_);
-}
-  
-template <typename V>
-const V & Reconstruct<V>::getVector(void)
-{
-  return vector_;
-}
-  
-template <typename V>
-const std::vector<typename Reconstruct<V>::Element> &
-Reconstruct<V>::getFlatVector(void)
-{
-  return flatVector_;
-}
-  
-template <typename V>
-const std::vector<size_t> & Reconstruct<V>::getDim(void)
-{
-  return dim_;
-}
-  
-// Generic writer interface //////////////////////////////////////////////////
-template <typename T>
-inline void push(Writer<T> &w, const std::string &s) {
-  w.push(s);
-}
-  
-template <typename T>
-inline void push(Writer<T> &w, const char *s)
-{
-  w.push(std::string(s));
-}
-  
-template <typename T>
-inline void pop(Writer<T> &w)
-{
-  w.pop();
-}
-  
-template <typename T, typename U>
-inline void write(Writer<T> &w, const std::string& s, const U &output)
-{
-  w.write(s, output);
-}
-  
-// Generic reader interface
-template <typename T>
-inline bool push(Reader<T> &r, const std::string &s)
-{
-  return r.push(s);
-}
-  
-template <typename T>
-inline bool push(Reader<T> &r, const char *s)
-{
-  return r.push(std::string(s));
-}
-  
-template <typename T>
-inline void pop(Reader<T> &r)
-{
-  r.pop();
-}
-  
-template <typename T, typename U>
-inline void read(Reader<T> &r, const std::string &s, U &output)
-{
-  r.read(s, output);
-}
-  
-// Writer template implementation ////////////////////////////////////////////
-template <typename T>
-Writer<T>::Writer(void)
-{
-  upcast = static_cast<T *>(this);
-}
-  
-template <typename T>
-void Writer<T>::push(const std::string &s)
-{
-  upcast->push(s);
-}
-  
-template <typename T>
-void Writer<T>::pop(void)
-{
-  upcast->pop();
-}
-  
-template <typename T>
-template <typename U>
-typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
-Writer<T>::write(const std::string &s, const U &output)
-{
-  U::write(*this, s, output);
-}
-  
-template <typename T>
-template <typename U>
-typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
-Writer<T>::write(const std::string &s, const U &output)
-{
-  upcast->writeDefault(s, output);
-}
-  
-// Reader template implementation
-template <typename T>
-Reader<T>::Reader(void)
-{
-  upcast = static_cast<T *>(this);
-}
-  
-template <typename T>
-bool Reader<T>::push(const std::string &s)
-{
-  return upcast->push(s);
-}
-  
-template <typename T>
-void Reader<T>::pop(void)
-{
-  upcast->pop();
-}
-  
-template <typename T>
-template <typename U>
-typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
-Reader<T>::read(const std::string &s, U &output)
-{
-  U::read(*this, s, output);
-}
-  
-template <typename T>
-template <typename U>
-typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
-Reader<T>::read(const std::string &s, U &output)
-{
-  upcast->readDefault(s, output);
-}
-  
-template <typename T>
-template <typename U>
-void Reader<T>::fromString(U &output, const std::string &s)
-{
-  std::istringstream is(s);
-    
-  is.exceptions(std::ios::failbit);
-  try
-    {
-      is >> std::boolalpha >> output;
-    }
-  catch(std::istringstream::failure &e)
-    {
-      std::cerr << "numerical conversion failure on '" << s << "' ";
-      std::cerr << "(typeid: " << typeid(U).name() << ")" << std::endl;
-      abort();
-    }
-}
-
-NAMESPACE_END(Grid);
-
-#endif
diff --git a/lib/simd/.dirstamp b/lib/simd/.dirstamp
deleted file mode 100644
index e69de29b..00000000
diff --git a/prerequisites/fftw-3.3.4.tar.gz b/prerequisites/fftw-3.3.4.tar.gz
deleted file mode 100644
index df0f808d..00000000
Binary files a/prerequisites/fftw-3.3.4.tar.gz and /dev/null differ
diff --git a/scripts/copyright b/scripts/copyright
index cc9ed6e5..2401f4aa 100755
--- a/scripts/copyright
+++ b/scripts/copyright
@@ -11,12 +11,16 @@ Grid physics library, www.github.com/paboyle/Grid
 
 Source file: $1
 
-Copyright (C) 2015
-Copyright (C) 2016
+Copyright (C) 2015-2018
 
 EOF
 
-git log $1 | grep Author | sort -u >> message
+git log $1 | grep Author > gitauth
+grep 'Author: '  $1 > fileauth
+
+cat gitauth fileauth | sort -u >> message
+
+rm gitauth fileauth
 
 cat >> message <<EOF
 
@@ -60,4 +64,4 @@ shift
 
 done
 
-
+rm message tmp.fil
diff --git a/scripts/eigen-3.3.5.Tensor.patch b/scripts/eigen-3.3.5.Tensor.patch
new file mode 100644
index 00000000..54984b94
--- /dev/null
+++ b/scripts/eigen-3.3.5.Tensor.patch
@@ -0,0 +1,19 @@
+--- ./Eigen/unsupported/Eigen/CXX11/Tensor	2018-07-23 10:33:42.000000000 +0100
++++ Tensor	2018-08-28 16:15:56.000000000 +0100
+@@ -25,7 +25,7 @@
+ #include <utility>
+ #endif
+ 
+-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
++#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+ 
+ #include "../SpecialFunctions"
+ #include "src/util/CXX11Meta.h"
+@@ -147,6 +147,6 @@
+ 
+ #include "src/Tensor/TensorIO.h"
+ 
+-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
++#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+ 
+ //#endif // EIGEN_CXX11_TENSOR_MODULE
diff --git a/scripts/filelist b/scripts/filelist
index 74f8e334..6db53687 100755
--- a/scripts/filelist
+++ b/scripts/filelist
@@ -3,7 +3,7 @@
 home=`pwd`
 
 # library Make.inc
-cd $home/lib
+cd $home/Grid
 HFILES=`find . -type f -name '*.h' -not -name '*Hdf5*' -not -path '*/gamma-gen/*' -not -path '*/Old/*' -not -path '*/Eigen/*'`
 HFILES="$HFILES"
 CCFILES=`find . -type f -name '*.cc' -not -path '*/gamma-gen/*' -not -name '*Communicator*.cc' -not -name '*SharedMemory*.cc' -not -name '*Hdf5*'`
diff --git a/scripts/update_eigen.sh b/scripts/update_eigen.sh
index d3d367f5..ab7b8f90 100755
--- a/scripts/update_eigen.sh
+++ b/scripts/update_eigen.sh
@@ -7,8 +7,31 @@ fi
 ARC=$1
 
 INITDIR=`pwd`
-cd ../lib
-echo 'eigen_files =\' > Eigen.inc
-find Eigen -type f -print | sed 's/^/  /;$q;s/$/ \\/' >> Eigen.inc
+
+rm -f Grid/Eigen
+rm -rf Eigen
+
+##################
+#untar
+##################
+tar -xf ${ARC}
+ARCDIR=`tar -tf ${ARC} | head -n1 | sed -e 's@/.*@@'`
+
+###############################
+# Link to a deterministic name
+###############################
+
+mv ${ARCDIR} Eigen
+ln -s ${INITDIR}/Eigen/Eigen ${INITDIR}/Grid/Eigen
+ln -s ${INITDIR}/Eigen/unsupported/Eigen ${INITDIR}/Grid/Eigen/unsupported
+
+# Eigen source headers
+cd ${INITDIR}/Grid
+echo 'eigen_files =\' > ${INITDIR}/Grid/Eigen.inc
+find -L Eigen -type f -print | sed 's/^/  /;$q;s/$/ \\/' >> ${INITDIR}/Grid/Eigen.inc
+
+###################################
+# back to home
+###################################
 cd ${INITDIR}
 
diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc
index f8604ba4..94821234 100644
--- a/tests/IO/Test_ildg_io.cc
+++ b/tests/IO/Test_ildg_io.cc
@@ -79,7 +79,7 @@ int main (int argc, char ** argv)
   std::cout <<GridLogMessage<<"** Writing out  ILDG conf    *********"<<std::endl;
   std::cout <<GridLogMessage<<"**************************************"<<std::endl;
   std::string file("./ckpoint_ildg.4000");
-  IldgWriter _IldgWriter;
+  IldgWriter _IldgWriter(Fine.IsBoss());
   _IldgWriter.open(file);
   _IldgWriter.writeConfiguration(Umu,4000,std::string("dummy_ildg_LFN"),std::string("dummy_config"));
   _IldgWriter.close();
diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc
index bfae7886..790183f8 100644
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@@ -45,7 +45,8 @@ public:
                           bool , b,
                           std::vector<double>, array,
                           std::vector<std::vector<double> >, twodimarray,
-                          std::vector<std::vector<std::vector<Complex> > >, cmplx3darray
+                          std::vector<std::vector<std::vector<Complex> > >, cmplx3darray,
+                          SpinColourMatrix, scm
                           );
   myclass() {}
   myclass(int i)
@@ -59,6 +60,12 @@ public:
     y=2*i;
     b=true;
     name="bother said pooh";
+    scm()(0, 1)(2, 1) = 2.356;
+    scm()(3, 0)(1, 1) = 1.323;
+    scm()(2, 1)(0, 1) = 5.3336;
+    scm()(0, 2)(1, 1) = 6.336;
+    scm()(2, 1)(2, 2) = 7.344;
+    scm()(1, 1)(2, 0) = 8.3534;
   }
 };
 
@@ -93,8 +100,32 @@ void ioTest(const std::string &filename, const O &object, const std::string &nam
   if (!good) exit(EXIT_FAILURE);
 }
 
+template <typename T> // round-trip check for tensor type T: tensor -> flat vector -> tensor
+void tensorConvTestFn(GridSerialRNG &rng, const std::string &label) // rng: source of the random input; label: text printed in front of the result
+{
+  T    t, ft;   // t: random input tensor, ft: tensor rebuilt from the flattened copy
+  Real n;
+  bool good;
+
+  random(rng, t);
+  auto tv = tensorToVec(t);
+  vecToTensor(ft, tv);
+  n    = norm2(t - ft);
+  good = (n == 0); // exact comparison: any non-zero difference is reported as a failure
+  std::cout << label << " norm 2 diff: " << n << " -- " 
+            << (good ? "success" : "failure") << std::endl; // result is only printed; nothing is returned or aborted
+}
+
+#define tensorConvTest(rng, type) tensorConvTestFn<type>(rng, #type)
+
 int main(int argc,char **argv)
 {
+  Grid_init(&argc,&argv);
+  
+  GridSerialRNG    rng;
+
+  rng.SeedFixedIntegers(std::vector<int>({42,10,81,9}));
+  
   std::cout << "==== basic IO" << std::endl;
   XmlWriter WR("bother.xml");
 
@@ -120,7 +151,7 @@ int main(int argc,char **argv)
   std::cout << "-- serialisable class writing to 'bother.xml'..." << std::endl;
   write(WR,"obj",obj);
   WR.write("obj2", obj);
-  vec.push_back(myclass(1234));
+  vec.push_back(obj);
   vec.push_back(myclass(5678));
   vec.push_back(myclass(3838));
   pair = std::make_pair(myenum::red, myenum::blue);
@@ -131,8 +162,6 @@ int main(int argc,char **argv)
   std::cout << "-- serialisable class comparison:" << std::endl;
   std::cout << "vec[0] == obj: " << ((vec[0] == obj) ? "true" : "false") << std::endl;
   std::cout << "vec[1] == obj: " << ((vec[1] == obj) ? "true" : "false") << std::endl;
-
-  write(WR, "objpair", pair);
   std::cout << "-- pair writing to std::cout:" << std::endl;
   std::cout << pair << std::endl;
 
@@ -141,26 +170,20 @@ int main(int argc,char **argv)
   //// XML
   ioTest<XmlWriter, XmlReader>("iotest.xml", obj, "XML    (object)           ");
   ioTest<XmlWriter, XmlReader>("iotest.xml", vec, "XML    (vector of objects)");
-  ioTest<XmlWriter, XmlReader>("iotest.xml", pair, "XML    (pair of objects)");
   //// binary
   ioTest<BinaryWriter, BinaryReader>("iotest.bin", obj, "binary (object)           ");
   ioTest<BinaryWriter, BinaryReader>("iotest.bin", vec, "binary (vector of objects)");
-  ioTest<BinaryWriter, BinaryReader>("iotest.bin", pair, "binary (pair of objects)");
   //// text
   ioTest<TextWriter, TextReader>("iotest.dat", obj, "text   (object)           ");
   ioTest<TextWriter, TextReader>("iotest.dat", vec, "text   (vector of objects)");
-  ioTest<TextWriter, TextReader>("iotest.dat", pair, "text   (pair of objects)");
   //// text
   ioTest<JSONWriter, JSONReader>("iotest.json", obj,  "JSON   (object)           ");
   ioTest<JSONWriter, JSONReader>("iotest.json", vec,  "JSON   (vector of objects)");
-  ioTest<JSONWriter, JSONReader>("iotest.json", pair, "JSON   (pair of objects)");
 
   //// HDF5
-#undef HAVE_HDF5
 #ifdef HAVE_HDF5
   ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", obj, "HDF5   (object)           ");
   ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", vec, "HDF5   (vector of objects)");
-  ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", pair, "HDF5   (pair of objects)");
 #endif
 
   std::cout << "\n==== vector flattening/reconstruction" << std::endl;
@@ -197,68 +220,11 @@ int main(int argc,char **argv)
   std::cout << flatdv.getVector() << std::endl;
   std::cout << std::endl;
 
-
-  std::cout << ".:::::: Testing JSON classes "<< std::endl;
-
-
-  {
-    JSONWriter JW("bother.json");
-
-    // test basic type writing
-    myenum a = myenum::red;
-    push(JW,"BasicTypes");
-    write(JW,std::string("i16"),i16);
-    write(JW,"myenum",a);
-    write(JW,"u16",u16);
-    write(JW,"i32",i32);
-    write(JW,"u32",u32);
-    write(JW,"i64",i64);
-    write(JW,"u64",u64);
-    write(JW,"f",f);
-    write(JW,"d",d);
-    write(JW,"b",b);
-    pop(JW);
-
-
-    // test serializable class writing
-    myclass obj(1234); // non-trivial constructor
-    std::cout << obj << std::endl;
-    std::cout << "-- serialisable class writing to 'bother.json'..." << std::endl;
-    write(JW,"obj",obj);
-    JW.write("obj2", obj);
-
-
-    std::vector<myclass> vec;
-    vec.push_back(myclass(1234));
-    vec.push_back(myclass(5678));
-    vec.push_back(myclass(3838));
-    write(JW, "objvec", vec);
-
-  }
-
-
-  {
-    JSONReader RD("bother.json");
-    myclass jcopy1;
-    std::vector<myclass> jveccopy1;
-    read(RD,"obj",jcopy1);
-    read(RD,"objvec", jveccopy1);
-    std::cout << "Loaded (JSON) -----------------" << std::endl;
-    std::cout << jcopy1 << std::endl << jveccopy1 << std::endl;
-  }
- 
-
-/*
-  // This is still work in progress
-  {
-    // Testing the next element function
-    JSONReader RD("test.json");
-    RD.push("grid");
-    RD.push("Observable");
-    std::string name;
-    read(RD,"name", name);
-  }
-*/
-
-
+  std::cout << "==== Grid tensor to vector test" << std::endl;
+  tensorConvTest(rng, SpinColourMatrix);
+  tensorConvTest(rng, SpinColourVector);
+  tensorConvTest(rng, ColourMatrix);
+  tensorConvTest(rng, ColourVector);
+  tensorConvTest(rng, SpinMatrix);
+  tensorConvTest(rng, SpinVector);
 }
diff --git a/tests/Test_compressed_lanczos_hot_start.cc b/tests/Test_compressed_lanczos_hot_start.cc
new file mode 100644
index 00000000..83f80297
--- /dev/null
+++ b/tests/Test_compressed_lanczos_hot_start.cc
@@ -0,0 +1,258 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_compressed_lanczos_hot_start.cc
+
+    Copyright (C) 2017
+
+Author: Leans heavily on Christoph Lehner's code
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+/*
+ *  Reimplement the badly named "multigrid" Lanczos as a compressed Lanczos, using the features
+ *  in Grid that were originally intended to support blocked aggregates.
+ */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class Fobj,class CComplex,int nbasis>
+class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis> // adds SciDAC/XML checkpoint write and restore for the fine and coarse eigen-data of the base class
+{ 
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector; // vector of nbasis CComplex entries
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid,
+			      LinearOperatorBase<FineField> &FineOp,
+			      int checkerboard) 
+    // No extra state here: every argument is forwarded unchanged to the base-class constructor
+    : LocalCoherenceLanczos<Fobj,CComplex,nbasis>(FineGrid,CoarseGrid,FineOp,checkerboard) 
+  {};
+
+  void checkpointFine(std::string evecs_file,std::string evals_file) // write subspace[0..nbasis) to evecs_file as SciDAC records and evals_fine to evals_file as XML
+  {
+#ifdef HAVE_LIME
+    assert(this->subspace.size()==nbasis); // the loop below writes exactly nbasis vectors
+    emptyUserRecord record;
+    Grid::ScidacWriter WR(this->_FineGrid->IsBoss());
+    WR.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      WR.writeScidacFieldRecord(this->subspace[k],record); // one record per vector, in index order
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_fine); // stored under the tag "evals"
+#else
+    assert(0); // the SciDAC path above is only compiled when HAVE_LIME is defined
+#endif
+  }
+
+  void checkpointFineRestore(std::string evecs_file,std::string evals_file) // inverse of checkpointFine: reload evals_fine (XML) and nbasis subspace vectors (SciDAC)
+  {
+#ifdef HAVE_LIME
+    this->evals_fine.resize(nbasis);
+    this->subspace.resize(nbasis,this->_FineGrid);
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_fine); // same "evals" tag that checkpointFine writes
+    
+    assert(this->evals_fine.size()==nbasis); // the file must hold exactly nbasis eigenvalues
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      this->subspace[k].checkerboard=this->_checkerboard; // tag the vector with this object's checkerboard before reading into it
+      RD.readScidacFieldRecord(this->subspace[k],record);
+      
+    }
+    RD.close();
+#else
+    assert(0); // the SciDAC path above is only compiled when HAVE_LIME is defined
+#endif 
+  }
+
+  void checkpointCoarse(std::string evecs_file,std::string evals_file) // write every vector in evec_coarse to evecs_file (SciDAC) and evals_coarse to evals_file (XML)
+  {
+#ifdef HAVE_LIME
+    int n = this->evec_coarse.size(); // number of coarse vectors currently held
+    emptyUserRecord record;
+    Grid::ScidacWriter WR(this->_CoarseGrid->IsBoss());
+    WR.open(evecs_file);
+    for(int k=0;k<n;k++) {
+      WR.writeScidacFieldRecord(this->evec_coarse[k],record); // one record per vector, in index order
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_coarse); // stored under the tag "evals"
+#else
+    assert(0); // the SciDAC path above is only compiled when HAVE_LIME is defined
+#endif
+  }
+
+  void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec) // reload nvec coarse eigenvalues (XML) and nvec coarse vectors (SciDAC); the caller supplies nvec
+  {
+#ifdef HAVE_LIME
+    std::cout << "resizing coarse vecs to " << nvec<< std::endl;
+    this->evals_coarse.resize(nvec);
+    this->evec_coarse.resize(nvec,this->_CoarseGrid);
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_coarse); // same "evals" tag that checkpointCoarse writes
+
+    assert(this->evals_coarse.size()==nvec); // the file must hold exactly nvec eigenvalues
+    emptyUserRecord record;
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    Grid::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nvec;k++) {
+      RD.readScidacFieldRecord(this->evec_coarse[k],record); // unlike the fine restore, no checkerboard is set here
+    }
+    RD.close();
+#else 
+    assert(0); // the SciDAC path above is only compiled when HAVE_LIME is defined
+#endif
+  }
+};
+
+int main (int argc, char ** argv) { // fine-grid Lanczos on a gauge field from SU3::HotConfiguration; run parameters are read from ./Params.xml
+
+  Grid_init(&argc,&argv);
+  GridLogIRL.TimingMode(1);
+
+  LocalCoherenceLanczosParams Params;
+  {
+    Params.omega.resize(10);
+    Params.blockSize.resize(5);
+    XmlWriter writer("Params_template.xml"); // dump a template of the parameter layout before the real parameters are read
+    write(writer,"Params",Params);
+    std::cout << GridLogMessage << " Written Params_template.xml" <<std::endl;
+  }
+  
+  { 
+    XmlReader reader(std::string("./Params.xml")); // the actual run parameters overwrite the template values set above
+    read(reader, "Params", Params);
+  }
+
+  int     Ls = (int)Params.omega.size(); // fifth-dimension extent is taken from the number of omega entries
+  RealD mass = Params.mass;
+  RealD M5   = Params.M5;
+  std::vector<int> blockSize = Params.blockSize;
+  std::vector<int> latt({32,32,32,32}); // lattice size is fixed here, not taken from the command line
+  uint64_t     vol = Ls*latt[0]*latt[1]*latt[2]*latt[3]; // NOTE(review): the product is evaluated in int and only then widened to uint64_t
+  double   mat_flop= 2.0*1320.0*vol;    
+  // Grids
+  GridCartesian         * UGrid     = SpaceTimeGrid::makeFourDimGrid(latt,
+								     GridDefaultSimd(Nd,vComplex::Nsimd()),
+								     GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid   = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> fineLatt     = latt;
+  int dims=fineLatt.size();
+  assert(blockSize.size()==dims+1); // one block size per lattice dimension plus one for the fifth dimension
+  std::vector<int> coarseLatt(dims);
+  std::vector<int> coarseLatt5d ;
+
+  for (int d=0;d<coarseLatt.size();d++){
+    coarseLatt[d] = fineLatt[d]/blockSize[d];    assert(coarseLatt[d]*blockSize[d]==fineLatt[d]); // block size must divide the fine extent exactly
+  }
+
+  std::cout << GridLogMessage<< " 5d coarse lattice is ";
+  for (int i=0;i<coarseLatt.size();i++){
+    std::cout << coarseLatt[i]<<"x";
+  } 
+  int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls); // same divisibility requirement for the fifth dimension
+  std::cout << cLs<<std::endl;
+  
+  GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * CoarseGrid4rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
+  GridCartesian         * CoarseGrid5    = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
+
+  // Gauge field
+  std::vector<int> seeds4({1,2,3,4}); // fixed seeds, so the generated configuration is the same on every run
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  LatticeGaugeField Umu(UGrid);
+  SU3::HotConfiguration(RNG4,Umu); // generated in place of reading a configuration file (see the disabled lines below)
+  //  FieldMetaData header;
+  //  NerscIO::readConfiguration(Umu,header,Params.config);
+
+  std::cout << GridLogMessage << "Lattice dimensions: " << latt << "   Ls: " << Ls << std::endl;
+
+  // ZMobius EO Operator
+  ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.);
+  SchurDiagTwoOperator<ZMobiusFermionR,LatticeFermion> HermOp(Ddwf);
+
+  // Eigenvector storage
+  LanczosParams fine  =Params.FineParams;  
+  LanczosParams coarse=Params.CoarseParams;  
+
+  const int Ns1 = fine.Nstop;   const int Ns2 = coarse.Nstop;
+  const int Nk1 = fine.Nk;      const int Nk2 = coarse.Nk;
+  const int Nm1 = fine.Nm;      const int Nm2 = coarse.Nm;
+
+  std::cout << GridLogMessage << "Keep " << fine.Nstop   << " fine   vectors" << std::endl;
+  std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl;
+  assert(Nm2 >= Nm1);
+
+  const int nbasis= 60; // compile-time template argument, so it cannot be read from the XML
+  assert(nbasis==Ns1); // Params.xml must therefore request exactly nbasis fine vectors
+  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
+  std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
+
+  assert( (Params.doFine)||(Params.doFineRead)); // NOTE(review): only the doFine branch is acted on below; doFineRead alone does nothing here
+
+  if ( Params.doFine ) { 
+    std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "<<Nk1<<" Nm "<<Nm1<< std::endl;
+    double t0=-usecond(); // start at minus "now"; adding "now" afterwards leaves the elapsed microseconds
+    _LocalCoherenceLanczos.calcFine(fine.Cheby,
+		 fine.Nstop,fine.Nk,fine.Nm,
+		 fine.resid,fine.MaxIt, 
+		 fine.betastp,fine.MinRes);
+    t0+=usecond();
+
+    double t1=-usecond(); // same elapsed-time idiom, for the optional checkpoint
+    if ( Params.saveEvecs ) {
+      std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
+      _LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
+    }
+    t1+=usecond();
+
+    std::cout << GridLogMessage << "Computation time is " << (t0)/1.0e6 <<" seconds"<<std::endl;
+    if ( Params.saveEvecs )  std::cout << GridLogMessage << "I/O         time is " << (t1)/1.0e6 <<" seconds"<<std::endl;
+    std::cout << GridLogMessage << "Time to solution is " << (t1+t0)/1.0e6 <<" seconds"<<std::endl;
+    std::cout << GridLogMessage << "Done"<<std::endl;
+  }
+
+  Grid_finalize();
+}
+
diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc
index b05d1334..0b8463d9 100644
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -49,6 +49,8 @@ int main (int argc, char ** argv)
 
   const int Ls=8;
 
+  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
+
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -90,18 +92,44 @@ int main (int argc, char ** argv)
   SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
   SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 
-  std::cout << "Starting mixed CG" << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
   MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
   mCG(src_o,result_o);
 
-  std::cout << "Starting regular CG" << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
   CG(HermOpEO,src_o,result_o_2);
 
   LatticeFermionD diff_o(FrbGrid);
   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
 
-  std::cout << "Diff between mixed and regular CG: " << diff << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
+
+  #ifdef HAVE_LIME
+  if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
+  
+  std::string file1("./Propagator1");
+  emptyUserRecord record;
+  uint32_t nersc_csum;
+  uint32_t scidac_csuma;
+  uint32_t scidac_csumb;
+  typedef SpinColourVectorD   FermionD;
+  typedef vSpinColourVectorD vFermionD;
+
+  BinarySimpleMunger<FermionD,FermionD> munge;
+  std::string format = getFormatString<vFermionD>();
+  
+  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
+						   nersc_csum,scidac_csuma,scidac_csumb);
+
+  std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
+
+  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
+						   nersc_csum,scidac_csuma,scidac_csumb);
+
+  std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
+  }
+  #endif
 
   
   Grid_finalize();
diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc
index b8a12342..2ba3752b 100644
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@@ -309,7 +309,8 @@ int main (int argc, char ** argv)
 
     // Momentum space prop
     std::cout << " Solving by FFT and Feynman rules" <<std::endl;
-    Ddwf.FreePropagator(src,ref,mass) ;
+    bool fiveD = false; //calculate 4d free propagator
+    Ddwf.FreePropagator(src,ref,mass,fiveD) ;
 
     Gamma G5(Gamma::Algebra::Gamma5);
 
diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc
index 786cff4e..a023e0d1 100644
--- a/tests/core/Test_main.cc
+++ b/tests/core/Test_main.cc
@@ -392,7 +392,6 @@ int main(int argc, char **argv) {
       }
       random(Foo);
       */
-      lex_sites(Foo);
 
       Integer mm[4];
       mm[0] = 1;
diff --git a/tests/core/Test_staggered5Dvec.cc b/tests/core/Test_staggered5Dvec.cc
index fedd445d..190f0eb8 100644
--- a/tests/core/Test_staggered5Dvec.cc
+++ b/tests/core/Test_staggered5Dvec.cc
@@ -141,6 +141,7 @@ int main (int argc, char ** argv)
   t1=usecond();
  
   std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
+  std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
   std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 
@@ -160,7 +161,8 @@ int main (int argc, char ** argv)
   localConvert(sresult,tmp);
  
   std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
-  std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
+  std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
+  std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 
 
@@ -181,6 +183,7 @@ int main (int argc, char ** argv)
   localConvert(sresult,tmp);
  
   std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
+  std::cout<<GridLogMessage << "norm ssrc   "<< norm2(ssrc)<<std::endl;
   std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)*extra<<std::endl;
 
diff --git a/tests/core/Test_staggered5DvecF.cc b/tests/core/Test_staggered5DvecF.cc
new file mode 100644
index 00000000..5d421673
--- /dev/null
+++ b/tests/core/Test_staggered5DvecF.cc
@@ -0,0 +1,196 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_staggered5DvecF.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv) // times Dhop for Ds and for the s-innermost sDs, and prints the norm of each result's difference from the first Ds run
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  const int Ls=16;
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+  int threads = GridThread::GetThreads();
+
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> seeds({1,2,3,4}); // fixed seeds, so the random fields are the same on every run
+
+  GridParallelRNG          pRNG4(UGrid);
+  GridParallelRNG          pRNG5(FGrid);
+  pRNG4.SeedFixedIntegers(seeds);
+  pRNG5.SeedFixedIntegers(seeds);
+
+  typedef typename ImprovedStaggeredFermion5DF::FermionField FermionField; 
+  typedef typename ImprovedStaggeredFermion5DF::ComplexField ComplexField; 
+  typename ImprovedStaggeredFermion5DF::ImplParams params; 
+
+  FermionField src   (FGrid);
+  random(pRNG5,src);
+  /*
+  std::vector<int> site({0,1,2,0,0});
+  ColourVector cv = zero;
+  cv()()(0)=1.0;
+  src = zero;
+  pokeSite(cv,src,site);
+  */
+  FermionField result(FGrid); result=zero;
+  FermionField    tmp(FGrid);    tmp=zero;
+  FermionField    err(FGrid);    err=zero; // zero-initialise err like result and tmp above
+  FermionField phi   (FGrid); random(pRNG5,phi);
+  FermionField chi   (FGrid); random(pRNG5,chi);
+
+  LatticeGaugeFieldF Umu(UGrid);
+  SU3::HotConfiguration(pRNG4,Umu);
+
+  /*
+  for(int mu=1;mu<4;mu++){
+    auto tmp = PeekIndex<LorentzIndex>(Umu,mu);
+        tmp = zero;
+    PokeIndex<LorentzIndex>(Umu,tmp,mu);
+  }
+  */
+  double volume=Ls; // accumulates Ls times the product of the four lattice extents
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+
+  RealD mass=0.1;
+  RealD c1=9.0/8.0;
+  RealD c2=-1.0/24.0;
+  RealD u0=1.0;
+
+  ImprovedStaggeredFermion5DF     Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params);
+  ImprovedStaggeredFermionVec5dF sDs(Umu,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,c1,c2,u0,params);
+
+  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation         "<<std::endl;
+  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
+
+  int ncall=1000;  // used only in the flop count below
+  int ncall1=1000; // iteration count of every timing loop; equal to ncall, so the flop count matches
+  double t0(0),t1(0);
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 +  == 1146
+
+  std::cout<<GridLogMessage << "Calling staggered operator"<<std::endl;
+  t0=usecond();
+  for(int i=0;i<ncall1;i++){
+    Ds.Dhop(src,result,0); // first run: its output in result is what every later run is compared against
+  }
+  t1=usecond();
+
+  
+  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; // flops per microsecond
+
+  std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl;
+
+#ifdef AVX512
+  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
+#else
+  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
+#endif
+
+  t0=usecond();
+  for(int i=0;i<ncall1;i++){
+    Ds.Dhop(src,tmp,0); // same Ds operator again, now with the kernel option selected just above
+  }
+  t1=usecond();
+ 
+  std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
+  std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+  err = tmp-result; 
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+
+  
+  FermionField ssrc  (sFGrid);  localConvert(src,ssrc); // copy of src on the s-innermost grid
+  FermionField sresult(sFGrid); sresult=zero;
+
+  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
+  t0=usecond();
+  for(int i=0;i<ncall1;i++){
+    sDs.Dhop(ssrc,sresult,0);
+  }
+  t1=usecond();
+  localConvert(sresult,tmp); // convert back to FGrid so it can be subtracted from result
+ 
+  std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
+  std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
+  std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+
+
+#ifdef AVX512
+  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
+#else
+  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
+#endif
+
+  err = tmp-result; // tmp still holds the hand-unrolled sDs result at this point
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  int extra=1; // multiplier on the iteration count of the last timing loop
+  t0=usecond();
+  for(int i=0;i<ncall1*extra;i++){
+    sDs.Dhop(ssrc,sresult,0);
+  }
+  t1=usecond();
+  localConvert(sresult,tmp);
+ 
+  std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
+  std::cout<<GridLogMessage << "norm ssrc   "<< norm2(ssrc)<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)*extra<<std::endl;
+
+  err = tmp-result; 
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+
+
+
+  Grid_finalize();
+}
diff --git a/tests/core/Test_wilson_clover.cc b/tests/core/Test_wilson_clover.cc
new file mode 100644
index 00000000..9281e298
--- /dev/null
+++ b/tests/core/Test_wilson_clover.cc
@@ -0,0 +1,357 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_wilson_clover.cc
+
+    Copyright (C) 2015
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main(int argc, char **argv)
+{
+  Grid_init(&argc, &argv);
+  // Consistency checks for WilsonCloverFermionR; each section only prints norms/differences, nothing is asserted.
+  std::vector<int> latt_size = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
+  std::vector<int> mpi_layout = GridDefaultMpi();
+  GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+  GridRedBlackCartesian RBGrid(&Grid);
+
+  int threads = GridThread::GetThreads();
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+  std::cout << GridLogMessage << "Grid floating point word size is REALF" << sizeof(RealF) << std::endl;
+  std::cout << GridLogMessage << "Grid floating point word size is REALD" << sizeof(RealD) << std::endl;
+  std::cout << GridLogMessage << "Grid floating point word size is REAL" << sizeof(Real) << std::endl;
+  // Fixed integer seeds: the random source, test vectors and gauge field are generated from these.
+  std::vector<int> seeds({1, 2, 3, 4});
+  GridParallelRNG pRNG(&Grid);
+  pRNG.SeedFixedIntegers(seeds);
+  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+
+  typedef typename WilsonCloverFermionR::FermionField FermionField;
+  typename WilsonCloverFermionR::ImplParams params;
+  WilsonAnisotropyCoefficients anis;
+
+  FermionField src(&Grid);
+  random(pRNG, src);
+  FermionField result(&Grid);
+  result = zero;
+  FermionField result2(&Grid);
+  result2 = zero;
+  FermionField ref(&Grid);
+  ref = zero;
+  FermionField tmp(&Grid);
+  tmp = zero;
+  FermionField err(&Grid);
+  err = zero;
+  FermionField err2(&Grid);
+  err2 = zero;
+  FermionField phi(&Grid);
+  random(pRNG, phi);
+  FermionField chi(&Grid);
+  random(pRNG, chi);
+  LatticeGaugeField Umu(&Grid);
+  SU3::HotConfiguration(pRNG, Umu);
+  std::vector<LatticeColourMatrix> U(4, &Grid);
+
+  double volume = 1;
+  for (int mu = 0; mu < Nd; mu++)
+  {
+    volume = volume * latt_size[mu];
+  }
+  // NOTE(review): volume is computed above but never read again in this function.
+  RealD mass = 0.1;
+  RealD csw_r = 1.0;
+  RealD csw_t = 1.0;
+
+  WilsonCloverFermionR Dwc(Umu, Grid, RBGrid, mass, csw_r, csw_t, anis, params);
+  //Dwc.ImportGauge(Umu); // not necessary, included in the constructor
+
+  std::cout << GridLogMessage << "==========================================================" << std::endl;
+  std::cout << GridLogMessage << "= Testing that Deo + Doe = Dunprec " << std::endl;
+  std::cout << GridLogMessage << "==========================================================" << std::endl;
+
+  FermionField src_e(&RBGrid);
+  FermionField src_o(&RBGrid);
+  FermionField r_e(&RBGrid);
+  FermionField r_o(&RBGrid);
+  FermionField r_eo(&Grid);
+  pickCheckerboard(Even, src_e, src);
+  pickCheckerboard(Odd, src_o, src);
+
+  Dwc.Meooe(src_e, r_o);
+  std::cout << GridLogMessage << "Applied Meo" << std::endl;
+  Dwc.Meooe(src_o, r_e);
+  std::cout << GridLogMessage << "Applied Moe" << std::endl;
+  Dwc.Dhop(src, ref, DaggerNo);
+  // Merge the two half-lattice results into one full-lattice field so it can be compared with ref.
+  setCheckerboard(r_eo, r_o);
+  setCheckerboard(r_eo, r_e);
+
+  err = ref - r_eo;
+  std::cout << GridLogMessage << "EO norm diff   " << norm2(err) << " " << norm2(ref) << " " << norm2(r_eo) << std::endl;
+
+  std::cout << GridLogMessage << "==============================================================" << std::endl;
+  std::cout << GridLogMessage << "= Test Ddagger is the dagger of D by requiring                " << std::endl;
+  std::cout << GridLogMessage << "=  < phi | Deo | chi > * = < chi | Deo^dag| phi>  " << std::endl;
+  std::cout << GridLogMessage << "==============================================================" << std::endl;
+
+  FermionField chi_e(&RBGrid);
+  FermionField chi_o(&RBGrid);
+
+  FermionField dchi_e(&RBGrid);
+  FermionField dchi_o(&RBGrid);
+
+  FermionField phi_e(&RBGrid);
+  FermionField phi_o(&RBGrid);
+
+  FermionField dphi_e(&RBGrid);
+  FermionField dphi_o(&RBGrid);
+
+  pickCheckerboard(Even, chi_e, chi);
+  pickCheckerboard(Odd, chi_o, chi);
+  pickCheckerboard(Even, phi_e, phi);
+  pickCheckerboard(Odd, phi_o, phi);
+
+  Dwc.Meooe(chi_e, dchi_o);
+  Dwc.Meooe(chi_o, dchi_e);
+  Dwc.MeooeDag(phi_e, dphi_o);
+  Dwc.MeooeDag(phi_o, dphi_e);
+  // Each inner product on one parity is compared below with the conjugate of its partner on the other parity.
+  ComplexD pDce = innerProduct(phi_e, dchi_e);
+  ComplexD pDco = innerProduct(phi_o, dchi_o);
+  ComplexD cDpe = innerProduct(chi_e, dphi_e);
+  ComplexD cDpo = innerProduct(chi_o, dphi_o);
+
+  std::cout << GridLogMessage << "e " << pDce << " " << cDpe << std::endl;
+  std::cout << GridLogMessage << "o " << pDco << " " << cDpo << std::endl;
+
+  std::cout << GridLogMessage << "pDce - conj(cDpo) " << pDce - conj(cDpo) << std::endl;
+  std::cout << GridLogMessage << "pDco - conj(cDpe) " << pDco - conj(cDpe) << std::endl;
+
+  std::cout << GridLogMessage << "==============================================================" << std::endl;
+  std::cout << GridLogMessage << "= Test MeeInv Mee = 1   (if csw!=0)                           " << std::endl;
+  std::cout << GridLogMessage << "==============================================================" << std::endl;
+
+  pickCheckerboard(Even, chi_e, chi);
+  pickCheckerboard(Odd, chi_o, chi);
+
+  Dwc.Mooee(chi_e, src_e);
+  Dwc.MooeeInv(src_e, phi_e);
+
+  Dwc.Mooee(chi_o, src_o);
+  Dwc.MooeeInv(src_o, phi_o);
+
+  setCheckerboard(phi, phi_e);
+  setCheckerboard(phi, phi_o);
+
+  err = phi - chi;
+  std::cout << GridLogMessage << "norm diff   " << norm2(err) << std::endl;
+
+  std::cout << GridLogMessage << "==============================================================" << std::endl;
+  std::cout << GridLogMessage << "= Test MeeDag MeeInvDag = 1    (if csw!=0)                    " << std::endl;
+  std::cout << GridLogMessage << "==============================================================" << std::endl;
+
+  pickCheckerboard(Even, chi_e, chi);
+  pickCheckerboard(Odd, chi_o, chi);
+
+  Dwc.MooeeDag(chi_e, src_e);
+  Dwc.MooeeInvDag(src_e, phi_e);
+
+  Dwc.MooeeDag(chi_o, src_o);
+  Dwc.MooeeInvDag(src_o, phi_o);
+
+  setCheckerboard(phi, phi_e);
+  setCheckerboard(phi, phi_o);
+
+  err = phi - chi;
+  std::cout << GridLogMessage << "norm diff   " << norm2(err) << std::endl;
+
+  std::cout << GridLogMessage << "==============================================================" << std::endl;
+  std::cout << GridLogMessage << "= Test MeeInv MeeDag = 1      (if csw!=0)                     " << std::endl;
+  std::cout << GridLogMessage << "==============================================================" << std::endl;
+
+  pickCheckerboard(Even, chi_e, chi);
+  pickCheckerboard(Odd, chi_o, chi);
+
+  Dwc.MooeeDag(chi_e, src_e);
+  Dwc.MooeeInv(src_e, phi_e);
+
+  Dwc.MooeeDag(chi_o, src_o);
+  Dwc.MooeeInv(src_o, phi_o);
+
+  setCheckerboard(phi, phi_e);
+  setCheckerboard(phi, phi_o);
+
+  err = phi - chi;
+  std::cout << GridLogMessage << "norm diff   " << norm2(err) << std::endl;
+
+  std::cout << GridLogMessage << "================================================================" << std::endl;
+  std::cout << GridLogMessage << "= Testing gauge covariance Clover term with EO preconditioning  " << std::endl;
+  std::cout << GridLogMessage << "================================================================" << std::endl;
+
+  chi = zero;
+  phi = zero;
+  tmp = zero;
+  pickCheckerboard(Even, chi_e, chi);
+  pickCheckerboard(Odd, chi_o, chi);
+  pickCheckerboard(Even, phi_e, phi);
+  pickCheckerboard(Odd, phi_o, phi);
+
+  Dwc.Mooee(src_e, chi_e);
+  Dwc.Mooee(src_o, chi_o);
+  setCheckerboard(chi, chi_e);
+  setCheckerboard(chi, chi_o);
+  setCheckerboard(src, src_e);
+  setCheckerboard(src, src_o);
+
+  ////////////////////// Gauge Transformation
+  std::vector<int> seeds2({5, 6, 7, 8});
+  GridParallelRNG pRNG2(&Grid);
+  pRNG2.SeedFixedIntegers(seeds2);
+  LatticeColourMatrix Omega(&Grid);
+  LatticeColourMatrix ShiftedOmega(&Grid);
+  LatticeGaugeField U_prime(&Grid);
+  U_prime = zero;
+  LatticeColourMatrix U_prime_mu(&Grid);
+  U_prime_mu = zero;
+  SU<Nc>::LieRandomize(pRNG2, Omega, 1.0);
+  for (int mu = 0; mu < Nd; mu++)
+  {
+    U[mu] = peekLorentz(Umu, mu);
+    ShiftedOmega = Cshift(Omega, mu, 1);
+    U_prime_mu = Omega * U[mu] * adj(ShiftedOmega);
+    pokeLorentz(U_prime, U_prime_mu, mu);
+  }
+  /////////////////
+
+  WilsonCloverFermionR Dwc_prime(U_prime, Grid, RBGrid, mass, csw_r, csw_t, anis, params);
+  Dwc_prime.ImportGauge(U_prime);
+
+  tmp = Omega * src;
+  pickCheckerboard(Even, src_e, tmp);
+  pickCheckerboard(Odd, src_o, tmp);
+
+  Dwc_prime.Mooee(src_e, phi_e);
+  Dwc_prime.Mooee(src_o, phi_o);
+
+  setCheckerboard(phi, phi_e);
+  setCheckerboard(phi, phi_o);
+
+  err = chi - adj(Omega) * phi;
+  std::cout << GridLogMessage << "norm diff   " << norm2(err) << std::endl;
+
+  std::cout << GridLogMessage << "=================================================================" << std::endl;
+  std::cout << GridLogMessage << "= Testing gauge covariance Clover term w/o EO preconditioning  " << std::endl;
+  std::cout << GridLogMessage << "================================================================" << std::endl;
+
+  chi = zero;
+  phi = zero;
+
+  WilsonFermionR Dw(Umu, Grid, RBGrid, mass, params);
+  Dw.ImportGauge(Umu);
+
+  Dw.M(src, result);
+  Dwc.M(src, chi);
+
+  Dwc_prime.M(Omega * src, phi);
+
+  WilsonFermionR Dw_prime(U_prime, Grid, RBGrid, mass, params);
+  Dw_prime.ImportGauge(U_prime);
+  Dw_prime.M(Omega * src, result2);
+  // err compares the clover operators (Dwc, Dwc_prime); err2 compares the plain Wilson operators (Dw, Dw_prime).
+  err = chi - adj(Omega) * phi;
+  err2 = result - adj(Omega) * result2;
+  std::cout << GridLogMessage << "norm diff Wilson   " << norm2(err2) << std::endl;
+  std::cout << GridLogMessage << "norm diff WilsonClover  " << norm2(err) << std::endl;
+
+  std::cout << GridLogMessage << "==========================================================" << std::endl;
+  std::cout << GridLogMessage << "= Testing Mooee(csw=0) Clover to reproduce Mooee Wilson   " << std::endl;
+  std::cout << GridLogMessage << "==========================================================" << std::endl;
+
+  chi = zero;
+  phi = zero;
+  err = zero;
+  WilsonCloverFermionR Dwc_csw0(Umu, Grid, RBGrid, mass, 0.0, 0.0, anis, params); //  <-- Notice: csw=0
+  Dwc_csw0.ImportGauge(Umu);
+
+  pickCheckerboard(Even, phi_e, phi);
+  pickCheckerboard(Odd, phi_o, phi);
+  pickCheckerboard(Even, chi_e, chi);
+  pickCheckerboard(Odd, chi_o, chi);
+
+  Dw.Mooee(src_e, chi_e);
+  Dw.Mooee(src_o, chi_o);
+  Dwc_csw0.Mooee(src_e, phi_e);
+  Dwc_csw0.Mooee(src_o, phi_o);
+
+  setCheckerboard(chi, chi_e);
+  setCheckerboard(chi, chi_o);
+  setCheckerboard(phi, phi_e);
+  setCheckerboard(phi, phi_o);
+  setCheckerboard(src, src_e);
+  setCheckerboard(src, src_o);
+
+  err = chi - phi;
+  std::cout << GridLogMessage << "norm diff  " << norm2(err) << std::endl;
+
+  std::cout << GridLogMessage << "==========================================================" << std::endl;
+  std::cout << GridLogMessage << "= Testing EO operator is equal to the unprec              " << std::endl;
+  std::cout << GridLogMessage << "==========================================================" << std::endl;
+
+  chi = zero;
+  phi = zero;
+  err = zero;
+
+  pickCheckerboard(Even, phi_e, phi);
+  pickCheckerboard(Odd, phi_o, phi);
+  pickCheckerboard(Even, chi_e, chi);
+  pickCheckerboard(Odd, chi_o, chi);
+
+  // M phi = (Mooee src_e + Meooe src_o , Meooe src_e + Mooee src_o)
+
+  Dwc.M(src, ref); // Reference result from the unpreconditioned operator
+
+  // EO matrix
+  Dwc.Mooee(src_e, chi_e); 
+  Dwc.Mooee(src_o, chi_o);
+  Dwc.Meooe(src_o, phi_e);
+  Dwc.Meooe(src_e, phi_o);
+
+  phi_o += chi_o;
+  phi_e += chi_e;
+
+  setCheckerboard(phi, phi_e);
+  setCheckerboard(phi, phi_o);
+  // The first two lines print the norms of ref and phi themselves; only the third is a difference.
+  err = ref - phi;
+  std::cout << GridLogMessage << "ref (unpreconditioned operator) norm  :" << norm2(ref) << std::endl;
+  std::cout << GridLogMessage << "phi (EO decomposition)          norm  :" << norm2(phi) << std::endl;
+  std::cout << GridLogMessage << "norm diff                             :" << norm2(err) << std::endl;
+
+  Grid_finalize();
+}
diff --git a/tests/debug/Test_cayley_cg.cc b/tests/debug/Test_cayley_cg.cc
index 86dbf8b2..4cdd54d2 100644
--- a/tests/debug/Test_cayley_cg.cc
+++ b/tests/debug/Test_cayley_cg.cc
@@ -1,5 +1,4 @@
     /*************************************************************************************
-
     Grid physics library, www.github.com/paboyle/Grid 
 
     Source file: ./tests/Test_cayley_cg.cc
@@ -27,6 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     *************************************************************************************/
     /*  END LEGAL */
 #include <Grid/Grid.h>
+#include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
 
 using namespace std;
 using namespace Grid;
@@ -46,6 +46,7 @@ struct scal {
 
 template<class What> 
 void  TestCGinversions(What & Ddwf, 
+		       LatticeGaugeField &Umu,
 		       GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
 		       GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
 		       RealD mass, RealD M5,
@@ -75,6 +76,25 @@ void  TestCGprec(What & Ddwf,
 		 GridParallelRNG *RNG4,
 		 GridParallelRNG *RNG5);
 
+template<class What> 
+void  TestReconstruct5D(What & Ddwf, 
+			LatticeGaugeField &Umu,
+			GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
+			GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
+			RealD mass, RealD M5,
+			GridParallelRNG *RNG4,
+			GridParallelRNG *RNG5);
+
+template<class What,class WhatF> 
+void  TestReconstruct5DFA(What & Ddwf, 
+			  WhatF & DdwfF, 
+			  LatticeGaugeField &Umu,
+			  GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
+			  GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
+			  RealD mass, RealD M5,
+			  GridParallelRNG *RNG4,
+			  GridParallelRNG *RNG5);
+
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -83,63 +103,104 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   const int Ls=8;
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 
 
+  GridCartesian         * UGridF   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								    GridDefaultSimd(Nd,vComplexF::Nsimd()),
+								    GridDefaultMpi());
+  GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
+  GridCartesian         * FGridF   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
+  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
+
+
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
   GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 
   LatticeGaugeField Umu(UGrid);
+  LatticeGaugeFieldF UmuF(UGridF);
   SU3::HotConfiguration(RNG4,Umu);
+  precisionChange(UmuF,Umu);
   std::vector<LatticeColourMatrix> U(4,UGrid);
 
   RealD mass=0.1;
   RealD M5  =1.8;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  TestCGinversions<DomainWallFermionR>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  DomainWallFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5);
+  TestCGinversions<DomainWallFermionR>(Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestReconstruct5DFA<DomainWallFermionR,DomainWallFermionF>(Ddwf,DdwfF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
 
   RealD b=1.5;// Scale factor b+c=2, b-c=1
   RealD c=0.5;
   std::vector<ComplexD> gamma(Ls,ComplexD(1.0,0.0));
 
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::cout<<GridLogMessage <<"MobiusFermion test"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   MobiusFermionR Dmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
-  TestCGinversions<MobiusFermionR>(Dmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  MobiusFermionF DmobF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,b,c);
+  TestCGinversions<MobiusFermionR>(Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestReconstruct5DFA<MobiusFermionR,MobiusFermionF>(Dmob,DmobF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
 
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::cout<<GridLogMessage <<"ZMobiusFermion test"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   ZMobiusFermionR ZDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,gamma,b,c);
-  TestCGinversions<ZMobiusFermionR>(ZDmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestCGinversions<ZMobiusFermionR>(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestReconstruct5D<ZMobiusFermionR>(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
 
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::cout<<GridLogMessage <<"MobiusZolotarevFermion test"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   MobiusZolotarevFermionR Dzolo(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,0.1,2.0);
-  TestCGinversions<MobiusZolotarevFermionR>(Dzolo,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestCGinversions<MobiusZolotarevFermionR>(Dzolo,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestReconstruct5D<MobiusZolotarevFermionR>(Dzolo,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
 
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::cout<<GridLogMessage <<"ScaledShamirFermion test"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   ScaledShamirFermionR Dsham(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,2.0);
-  TestCGinversions<ScaledShamirFermionR>(Dsham,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  ScaledShamirFermionF DshamF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,2.0);
+  TestCGinversions<ScaledShamirFermionR>(Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestReconstruct5DFA<ScaledShamirFermionR,ScaledShamirFermionF>(Dsham,DshamF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
 
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::cout<<GridLogMessage <<"ShamirZolotarevFermion test"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   ShamirZolotarevFermionR Dshamz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
-  TestCGinversions<ShamirZolotarevFermionR>(Dshamz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestCGinversions<ShamirZolotarevFermionR>(Dshamz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestReconstruct5D<ShamirZolotarevFermionR>(Dshamz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
 
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::cout<<GridLogMessage <<"OverlapWilsonCayleyTanhFermion test"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   OverlapWilsonCayleyTanhFermionR Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0);
-  TestCGinversions<OverlapWilsonCayleyTanhFermionR>(Dov,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  OverlapWilsonCayleyTanhFermionF DovF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,1.0);
+  TestCGinversions<OverlapWilsonCayleyTanhFermionR>(Dov,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestReconstruct5DFA<OverlapWilsonCayleyTanhFermionR,OverlapWilsonCayleyTanhFermionF>(Dov,DovF,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
 
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   std::cout<<GridLogMessage <<"OverlapWilsonCayleyZolotarevFermion test"<<std::endl;
+  std::cout<<GridLogMessage <<"======================"<<std::endl;
   OverlapWilsonCayleyZolotarevFermionR Dovz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
-  TestCGinversions<OverlapWilsonCayleyZolotarevFermionR>(Dovz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestCGinversions<OverlapWilsonCayleyZolotarevFermionR>(Dovz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestReconstruct5D<OverlapWilsonCayleyZolotarevFermionR>(Dovz,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
 
   Grid_finalize();
 }
 template<class What> 
 void  TestCGinversions(What & Ddwf, 
+		       LatticeGaugeField &Umu,
 		       GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
 		       GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
 		       RealD mass, RealD M5,
@@ -154,6 +215,7 @@ void  TestCGinversions(What & Ddwf,
   TestCGschur<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
 }
 
+
 template<class What> 
 void  TestCGunprec(What & Ddwf, 
 		   GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
@@ -189,6 +251,147 @@ void  TestCGprec(What & Ddwf,
   CG(HermOpEO,src_o,result_o);
 }
 
+template<class What> 
+void  TestReconstruct5D(What & Ddwf, 
+			LatticeGaugeField & Umu,
+			GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
+			GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
+			RealD mass, RealD M5,
+			GridParallelRNG *RNG4,
+			GridParallelRNG *RNG5)
+{ // Solve with CG, then check Reconstruct5DfromPhysical and MADWF against that solution (prints norms only).
+  LatticeFermion src4   (UGrid); random(*RNG4,src4);
+  LatticeFermion res4   (UGrid); res4 = zero;
+  // NOTE(review): Umu, FrbGrid, UrbGrid, mass, M5 and RNG5 are accepted but not read in this function.
+  LatticeFermion src   (FGrid);
+  LatticeFermion src_NE(FGrid);
+  LatticeFermion result(FGrid);
+  LatticeFermion result_rec(FGrid);
+  LatticeFermion result_madwf(FGrid);
+
+  MdagMLinearOperator<What,LatticeFermion> HermOp(Ddwf);
+  double Resid = 1.0e-12;
+  double Residi = 1.0e-6;
+  ConjugateGradient<LatticeFermion> CG(Resid,10000);
+  ConjugateGradient<LatticeFermion> CGi(Residi,10000);
+  // Reference solve: build the FGrid source from src4, apply Mdag, and run CG on HermOp (MdagM).
+  Ddwf.ImportPhysicalFermionSource(src4,src);
+  Ddwf.Mdag(src,src_NE);
+  CG(HermOp,src_NE,result);
+
+  Ddwf.ExportPhysicalFermionSolution(result, res4);
+  // src_NE is reused as scratch: it now holds M*result - src, whose norm is reported as the true residual.
+  Ddwf.M(result,src_NE);
+  src_NE = src_NE - src;
+  std::cout <<GridLogMessage<< " True residual is " << norm2(src_NE)<<std::endl;
+
+  std::cout <<GridLogMessage<< " Reconstructing " <<std::endl;
+
+  ////////////////////////////
+  // RBprec PV inverse
+  ////////////////////////////
+  typedef LatticeFermion Field;
+  typedef SchurRedBlackDiagTwoSolve<Field> SchurSolverType; 
+  typedef SchurRedBlackDiagTwoSolve<Field> SchurSolverTypei; 
+  typedef PauliVillarsSolverRBprec<Field,SchurSolverType> PVinverter;
+  SchurSolverType SchurSolver(CG);
+  PVinverter      PVinverse(SchurSolver);
+
+  Reconstruct5DfromPhysical<LatticeFermion,PVinverter> reconstructor(PVinverse);
+  // Fill result_rec from the pair (res4, src4); it is compared with the direct CG solution below.
+  reconstructor(Ddwf,res4,src4,result_rec);
+
+  std::cout <<GridLogMessage << "Result     "<<norm2(result)<<std::endl;
+  std::cout <<GridLogMessage << "Result_rec "<<norm2(result_rec)<<std::endl;
+
+  result_rec = result_rec - result;
+  std::cout <<GridLogMessage << "Difference "<<norm2(result_rec)<<std::endl;
+
+  //////////////////////////////
+  // Now try MADWF
+  //////////////////////////////
+  SchurSolverTypei SchurSolveri(CGi);
+  ZeroGuesser<LatticeFermion> Guess;
+  MADWF<What,What,PVinverter,SchurSolverTypei,ZeroGuesser<LatticeFermion> > 
+    madwf(Ddwf,Ddwf,PVinverse,SchurSolveri,Guess,Resid,10);
+  // The same operator Ddwf is passed for both operator arguments; the inner solver uses the looser Residi.
+  madwf(src4,result_madwf);
+  result_madwf = result_madwf - result;
+  std::cout <<GridLogMessage << "Difference "<<norm2(result_madwf)<<std::endl;
+  // NOTE(review): this "Difference" line is MADWF vs CG; the earlier one is reconstruction vs CG (same label).
+
+}
+template<class What,class WhatF> 
+void  TestReconstruct5DFA(What & Ddwf, 
+			  WhatF & DdwfF, 
+			  LatticeGaugeField & Umu,
+			  GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
+			  GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
+			  RealD mass, RealD M5,
+			  GridParallelRNG *RNG4,
+			  GridParallelRNG *RNG5)
+{ // Same check as TestReconstruct5D, using PauliVillarsSolverFourierAccel and a second operator DdwfF for MADWF.
+  LatticeFermion src4   (UGrid); random(*RNG4,src4);
+  LatticeFermion res4   (UGrid); res4 = zero;
+  // NOTE(review): FrbGrid, UrbGrid, mass, M5 and RNG5 are accepted but not read in this function.
+  LatticeFermion src   (FGrid);
+  LatticeFermion src_NE(FGrid);
+  LatticeFermion result(FGrid);
+  LatticeFermion result_rec(FGrid);
+  LatticeFermion result_madwf(FGrid);
+
+  MdagMLinearOperator<What,LatticeFermion> HermOp(Ddwf);
+  double Resid = 1.0e-12;
+  double Residi = 1.0e-5;
+  ConjugateGradient<LatticeFermion> CG(Resid,10000);
+  ConjugateGradient<LatticeFermionF> CGi(Residi,10000);
+  // Reference solve: build the FGrid source from src4, apply Mdag, and run CG on HermOp (MdagM).
+  Ddwf.ImportPhysicalFermionSource(src4,src);
+  Ddwf.Mdag(src,src_NE);
+  CG(HermOp,src_NE,result);
+
+  Ddwf.ExportPhysicalFermionSolution(result, res4);
+  // src_NE is reused as scratch: it now holds M*result - src, whose norm is reported as the true residual.
+  Ddwf.M(result,src_NE);
+  src_NE = src_NE - src;
+  std::cout <<GridLogMessage<< " True residual is " << norm2(src_NE)<<std::endl;
+
+  std::cout <<GridLogMessage<< " Reconstructing " <<std::endl;
+
+  ////////////////////////////
+  // Fourier accel PV inverse
+  ////////////////////////////
+  typedef LatticeFermion Field;
+  typedef LatticeFermionF FieldF;
+  typedef SchurRedBlackDiagTwoSolve<FieldF> SchurSolverTypei; 
+  typedef PauliVillarsSolverFourierAccel<LatticeFermion,LatticeGaugeField> PVinverter;
+  PVinverter PVinverse(Umu,CG);
+
+  Reconstruct5DfromPhysical<LatticeFermion,PVinverter> reconstructor(PVinverse);
+  // Fill result_rec from the pair (res4, src4); it is compared with the direct CG solution below.
+  reconstructor(Ddwf,res4,src4,result_rec);
+
+  std::cout <<GridLogMessage << "Result     "<<norm2(result)<<std::endl;
+  std::cout <<GridLogMessage << "Result_rec "<<norm2(result_rec)<<std::endl;
+
+  result_rec = result_rec - result;
+  std::cout <<GridLogMessage << "Difference "<<norm2(result_rec)<<std::endl;
+
+  //////////////////////////////
+  // Now try MADWF
+  //////////////////////////////
+  SchurSolverTypei SchurSolver(CGi);
+  ZeroGuesser<LatticeFermionF> Guess;
+  MADWF<What,WhatF,PVinverter,SchurSolverTypei,ZeroGuesser<LatticeFermionF> > 
+    madwf(Ddwf,DdwfF,PVinverse,SchurSolver,Guess,Resid,10);
+  // MADWF gets Ddwf and DdwfF; its Schur solver wraps CGi, which works on LatticeFermionF at tolerance Residi.
+  madwf(src4,result_madwf);
+  result_madwf = result_madwf - result;
+  std::cout <<GridLogMessage << "Difference "<<norm2(result_madwf)<<std::endl;
+  // NOTE(review): this "Difference" line is MADWF vs CG; the earlier one is reconstruction vs CG (same label).
+}
+
+
 
 template<class What> 
 void  TestCGschur(What & Ddwf, 
diff --git a/tests/debug/Test_cayley_coarsen_support.cc b/tests/debug/Test_cayley_coarsen_support.cc
index 5004eaca..e6e18250 100644
--- a/tests/debug/Test_cayley_coarsen_support.cc
+++ b/tests/debug/Test_cayley_coarsen_support.cc
@@ -111,6 +111,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage<<"Error "<<norm2(err)<<std::endl;
 
   const int nbasis = 2;
+  const int cb = 0 ;
   LatticeFermion prom(FGrid);
 
   std::vector<LatticeFermion> subspace(nbasis,FGrid);
@@ -119,7 +120,7 @@ int main (int argc, char ** argv)
 
   MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
-  int cb = 0;
+
   Subspace Aggregates(Coarse5d,FGrid,cb);
   Aggregates.CreateSubspaceRandom(RNG5);
 
diff --git a/tests/debug/Test_cayley_ldop_cr.cc b/tests/debug/Test_cayley_ldop_cr.cc
index 2a196bfe..82f388ab 100644
--- a/tests/debug/Test_cayley_ldop_cr.cc
+++ b/tests/debug/Test_cayley_ldop_cr.cc
@@ -78,6 +78,7 @@ int main (int argc, char ** argv)
 
   RealD mass=0.1;
   RealD M5=1.5;
+  int cb=0;
 
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Building g5R5 hermitian DWF operator" <<std::endl;
@@ -95,10 +96,9 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Calling Aggregation class to build subspace" <<std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermDefOp(Ddwf);
-  Subspace Aggregates(Coarse5d,FGrid,0);
+  Subspace Aggregates(Coarse5d,FGrid,cb);
   Aggregates.CreateSubspace(RNG5,HermDefOp);
 
-
   LittleDiracOperator LittleDiracOp(*Coarse5d);
   LittleDiracOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates);
   
diff --git a/tests/debug/Test_split_laplacian.cc b/tests/debug/Test_split_laplacian.cc
new file mode 100644
index 00000000..174dc1b1
--- /dev/null
+++ b/tests/debug/Test_split_laplacian.cc
@@ -0,0 +1,104 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_split_laplacian.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int main (int argc, char ** argv)
+{
+  typedef LatticeComplex ComplexField; 
+  // Builds a full grid and a sub-grid of extent 1 in the last dimension, then copies a field slice by slice.
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  int nd   = latt_size.size();
+  int ndm1 = nd-1;
+
+  std::vector<int> simd_layout = GridDefaultSimd(nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  std::vector<int> mpi_split (mpi_layout.size(),1);
+  // NOTE(review): mpi_split is never read below.
+  std::cout << " Full " << GridCmdVectorIntToString(latt_size)  << " subgrid"         <<std::endl;
+  std::cout << " Full " << GridCmdVectorIntToString(mpi_layout) << " sub communicator"<<std::endl;
+  std::cout << " Full " << GridCmdVectorIntToString(simd_layout)<< " simd layout "    <<std::endl;
+  // NOTE(review): GridN and Grid_m are allocated with new and never deleted before the program ends.
+  GridCartesian         * GridN = new GridCartesian(latt_size,
+						    simd_layout,
+						    mpi_layout);
+  // Sub-grid parameters: copies of the full ones with the last-dimension entry set to 1 (SIMD entry appended as 1).
+  std::vector<int> latt_m  = latt_size;   latt_m[nd-1] = 1;
+  std::vector<int> mpi_m   = mpi_layout;  mpi_m [nd-1] = 1;
+  std::vector<int> simd_m  = GridDefaultSimd(ndm1,vComplex::Nsimd()); simd_m.push_back(1);
+
+
+  std::cout << " Requesting " << GridCmdVectorIntToString(latt_m)<< " subgrid"         <<std::endl;
+  std::cout << " Requesting " << GridCmdVectorIntToString(mpi_m) << " sub communicator"<<std::endl;
+  std::cout << " Requesting " << GridCmdVectorIntToString(simd_m)<< " simd layout "    <<std::endl;
+  GridCartesian         * Grid_m = new GridCartesian(latt_m,
+						     simd_m,
+						     mpi_m,
+						     *GridN); 
+  // Constant fields (every site set to C) on both grids; their norms are printed as the "volume" lines.
+  Complex C(1.0);
+  Complex tmp;
+
+  ComplexField Full(GridN); Full = C;
+  ComplexField Full_cpy(GridN);
+  ComplexField Split(Grid_m);Split= C;
+
+  std::cout << GridLogMessage<< " Full  volume "<< norm2(Full) <<std::endl;
+  std::cout << GridLogMessage<< " Split volume "<< norm2(Split) <<std::endl;
+  // Globally sum the constant C on each grid; the results are printed as the "nodes" lines.
+  tmp=C;
+  GridN->GlobalSum(tmp);
+  std::cout << GridLogMessage<< " Full  nodes "<< tmp <<std::endl;
+
+  tmp=C;
+  Grid_m->GlobalSum(tmp);
+  std::cout << GridLogMessage<< " Split nodes "<< tmp <<std::endl;
+  GridN->Barrier();
+
+  auto local_latt = GridN->LocalDimensions();
+  // Round trip: extract each local slice of Full into Split, then insert it into the zeroed Full_cpy.
+  Full_cpy = zero;
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          RNG(GridN);  RNG.SeedFixedIntegers(seeds);
+  // NOTE(review): the slice direction is hard-coded to Tp while the loop bound uses nd-1; assumes Tp == nd-1 - confirm.
+  random(RNG,Full);
+  for(int t=0;t<local_latt[nd-1];t++){
+    ExtractSliceLocal(Split,Full,0,t,Tp);
+    InsertSliceLocal (Split,Full_cpy,0,t,Tp);
+  }
+  Full_cpy = Full_cpy - Full;
+  std::cout << " NormFull " << norm2(Full)<<std::endl;
+  std::cout << " NormDiff " << norm2(Full_cpy)<<std::endl;
+  Grid_finalize();
+}
diff --git a/tests/forces/Test_gp_plaq_force.cc b/tests/forces/Test_gp_plaq_force.cc
new file mode 100644
index 00000000..e121f21b
--- /dev/null
+++ b/tests/forces/Test_gp_plaq_force.cc
@@ -0,0 +1,123 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/forces/Test_gp_plaq_force.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv) // Checks Action.deriv against the change of Action.S under a small gauge-field update
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(&Grid); // NOTE(review): not referenced again in this function
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> seeds({1,2,3,4}); // NOTE(review): never read; pRNG is seeded from the literal list below
+
+  GridParallelRNG          pRNG(&Grid);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+
+  LatticeGaugeField U(&Grid);
+
+  SU3::HotConfiguration(pRNG,U);
+  
+  double beta = 1.0;
+  double c1   = 0.331; // only referenced by the commented-out plaquette+rectangle action below
+
+  //ConjugatePlaqPlusRectangleActionR Action(beta,c1);
+  ConjugateWilsonGaugeActionR Action(beta);
+  //WilsonGaugeActionR Action(beta);
+
+  ComplexD S    = Action.S(U); // action evaluated on the unmodified field
+
+  // get the derivative of the gauge action with respect to "U"
+  LatticeGaugeField UdSdU(&Grid);
+
+  Action.deriv(U,UdSdU);
+
+  ////////////////////////////////////
+  // Modify the gauge field a little 
+  ////////////////////////////////////
+  RealD dt = 0.0001;
+
+  LatticeColourMatrix mommu(&Grid); 
+  LatticeColourMatrix forcemu(&Grid); // NOTE(review): never used in this function
+  LatticeGaugeField mom(&Grid); 
+  LatticeGaugeField Uprime(&Grid); 
+
+  for(int mu=0;mu<Nd;mu++){
+
+    SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg
+
+    PokeIndex<LorentzIndex>(mom,mommu,mu);
+
+    // first-order approximation to the exponential: Uprime = U + p U dt
+    parallel_for(auto i=mom.begin();i<mom.end();i++){ // approximates exp(pmu dt) * Umu to O(dt)
+      Uprime[i](mu) = U[i](mu) + mom[i](mu)*U[i](mu)*dt ;
+    }
+  }
+
+  ComplexD Sprime    = Action.S(Uprime); // action evaluated on the modified field
+
+  //////////////////////////////////////////////
+  // Use derivative to estimate dS
+  //////////////////////////////////////////////
+
+  LatticeComplex dS(&Grid); dS = zero;
+
+  for(int mu=0;mu<Nd;mu++){
+
+    auto UdSdUmu = PeekIndex<LorentzIndex>(UdSdU,mu);
+         mommu   = PeekIndex<LorentzIndex>(mom,mu);
+
+    // Update gauge action density
+    // U = exp(p dt) U
+    // dU/dt = p U
+    // so dSdt = trace( dUdt dSdU) = trace( p UdSdUmu ) 
+
+    dS = dS - trace(mommu*UdSdUmu)*dt*2.0;
+
+  }
+  ComplexD dSpred    = sum(dS); // predicted change of the action, summed over the lattice
+
+  std::cout << GridLogMessage << " S      "<<S<<std::endl;
+  std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
+  std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
+  std::cout << GridLogMessage << "pred dS "<< dSpred <<std::endl;
+  assert( fabs(real(Sprime-S-dSpred)) < 1.0e-2 ) ; // measured vs predicted dS; NOTE(review): assert is a no-op when NDEBUG is defined
+  std::cout<< GridLogMessage << "Done" <<std::endl;
+  Grid_finalize();
+}
diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc
index 12b87610..087d0e77 100644
--- a/tests/forces/Test_gp_rect_force.cc
+++ b/tests/forces/Test_gp_rect_force.cc
@@ -57,7 +57,8 @@ int main (int argc, char ** argv)
   SU3::HotConfiguration(pRNG,U);
   
   double beta = 1.0;
-  double c1   = -1.0/12.0;
+  double c1   = 0.331;
+
   ConjugatePlaqPlusRectangleActionR Action(beta,c1);
   //ConjugateWilsonGaugeActionR Action(beta);
   //WilsonGaugeActionR Action(beta);
diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc
index 9a4a2c1c..1c1f7768 100644
--- a/tests/forces/Test_gpwilson_force.cc
+++ b/tests/forces/Test_gpwilson_force.cc
@@ -84,7 +84,7 @@ int main (int argc, char ** argv)
   ////////////////////////////////////
   // Modify the gauge field a little 
   ////////////////////////////////////
-  RealD dt = 0.0001;
+  RealD dt = 0.01;
 
   LatticeColourMatrix mommu(UGrid); 
   LatticeColourMatrix forcemu(UGrid); 
diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc
index 5bfdef72..e77a895c 100644
--- a/tests/forces/Test_wilson_force.cc
+++ b/tests/forces/Test_wilson_force.cc
@@ -47,7 +47,12 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
 
   GridParallelRNG          pRNG(&Grid);
-  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+  std::vector<int> vrand(4);
+  std::srand(std::time(0));
+  std::generate(vrand.begin(), vrand.end(), std::rand);
+  std::cout << GridLogMessage << vrand << std::endl;
+  pRNG.SeedFixedIntegers(vrand);
+  //pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
   LatticeFermion phi        (&Grid); gaussian(pRNG,phi);
   LatticeFermion Mphi       (&Grid); 
diff --git a/tests/forces/Test_wilsonclover_force.cc b/tests/forces/Test_wilsonclover_force.cc
new file mode 100644
index 00000000..f7090845
--- /dev/null
+++ b/tests/forces/Test_wilsonclover_force.cc
@@ -0,0 +1,194 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./tests/forces/Test_wilsonclover_force.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main(int argc, char **argv) // Checks Dw.MDeriv against the change of S = |M phi|^2 under a small gauge-field update
+{
+  Grid_init(&argc, &argv);
+
+  std::vector<int> latt_size = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
+  std::vector<int> mpi_layout = GridDefaultMpi();
+
+  GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+  GridRedBlackCartesian RBGrid(&Grid);
+
+  int threads = GridThread::GetThreads();
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  std::vector<int> seeds({1, 2, 30, 50}); // only referenced by the commented-out fixed seeding below
+
+  GridParallelRNG pRNG(&Grid);
+
+  std::vector<int> vrand(4);
+  std::srand(std::time(0)); // NOTE(review): time-based seed, so every run draws different fields; the seeds are printed below
+  std::generate(vrand.begin(), vrand.end(), std::rand);
+  std::cout << GridLogMessage << vrand << std::endl;
+  pRNG.SeedFixedIntegers(vrand);
+  //pRNG.SeedFixedIntegers(seeds);
+
+  LatticeFermion phi(&Grid);
+  gaussian(pRNG, phi);
+  LatticeFermion Mphi(&Grid);
+  LatticeFermion MphiPrime(&Grid);
+
+  LatticeGaugeField U(&Grid);
+
+  std::vector<int> site = {0, 0, 0, 0}; // NOTE(review): never used in this function
+  SU3::HotConfiguration(pRNG, U);
+  //SU3::ColdConfiguration(pRNG, U);// Clover term zero
+
+  ////////////////////////////////////
+  // Unmodified matrix element
+  ////////////////////////////////////
+  RealD mass = 0.1;
+  Real csw = 1.0;
+  WilsonCloverFermionR Dw(U, Grid, RBGrid, mass, csw, csw); // csw is passed twice; presumably two separate clover coefficients (confirm against the constructor)
+  Dw.ImportGauge(U);
+  Dw.M(phi, Mphi);
+  ComplexD S = innerProduct(Mphi, Mphi); // Action : pdag MdagM p
+
+  // get the deriv of phidag MdagM phi with respect to "U"
+  LatticeGaugeField UdSdU(&Grid);
+  LatticeGaugeField tmp(&Grid);
+
+  ////////////////////////////////////////////
+  Dw.MDeriv(tmp, Mphi, phi, DaggerNo);
+  UdSdU = tmp;
+  Dw.MDeriv(tmp, phi, Mphi, DaggerYes);
+  UdSdU += tmp; // UdSdU now holds the sum of the DaggerNo and DaggerYes contributions
+  /////////////////////////////////////////////
+
+  ////////////////////////////////////
+  // Modify the gauge field a little
+  ////////////////////////////////////
+  RealD dt = 0.00005;
+  RealD Hmom = 0.0;      // accumulates -Re sum tr(p_mu p_mu) for the initial momenta
+  RealD Hmomprime = 0.0; // same quantity for the momenta after the half-step force update below
+  RealD Hmompp = 0.0;    // NOTE(review): never used in this function
+  LatticeColourMatrix mommu(&Grid);
+  LatticeColourMatrix forcemu(&Grid);
+  LatticeGaugeField mom(&Grid);
+  LatticeGaugeField Uprime(&Grid);
+
+  for (int mu = 0; mu < Nd; mu++)
+  {
+    // Traceless antihermitian momentum; gaussian in lie alg
+    SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu);
+    Hmom -= real(sum(trace(mommu * mommu)));
+    PokeIndex<LorentzIndex>(mom, mommu, mu);
+
+    parallel_for(int ss = 0; ss < mom._grid->oSites(); ss++)
+    {
+      Uprime[ss]._internal[mu] = ProjectOnGroup(Exponentiate(mom[ss]._internal[mu], dt, 12) * U[ss]._internal[mu]); // 12 is presumably the expansion order of Exponentiate (confirm)
+    }
+  }
+
+  std::cout << GridLogMessage << "Initial mom hamiltonian is " << Hmom << std::endl;
+
+  // New action
+  Dw.ImportGauge(Uprime);
+  Dw.M(phi, MphiPrime);
+  ComplexD Sprime = innerProduct(MphiPrime, MphiPrime);
+
+  //////////////////////////////////////////////
+  // Use derivative to estimate dS
+  //////////////////////////////////////////////
+
+  LatticeComplex dS(&Grid);
+  dS = zero;
+  LatticeComplex dSmom(&Grid);
+  dSmom = zero;
+  LatticeComplex dSmom2(&Grid);
+  dSmom2 = zero;
+
+  for (int mu = 0; mu < Nd; mu++)
+  {
+    mommu = PeekIndex<LorentzIndex>(UdSdU, mu); // P_mu = UdSdU_mu
+    mommu = Ta(mommu) * 2.0;                    // Mom = (P_mu - P_mu^dag) - trace(P_mu - P_mu^dag)
+    PokeIndex<LorentzIndex>(UdSdU, mommu, mu);  // UdSdU_mu = Mom
+  }
+
+  std::cout << GridLogMessage << "Antihermiticity tests" << std::endl; // prints norm2(X) and norm2(X + adj(X)) for momenta and force
+  for (int mu = 0; mu < Nd; mu++)
+  {
+    mommu = PeekIndex<LorentzIndex>(mom, mu);
+    std::cout << GridLogMessage << " Mommu  " << norm2(mommu) << std::endl;
+    mommu = mommu + adj(mommu);
+    std::cout << GridLogMessage << " Mommu + Mommudag " << norm2(mommu) << std::endl;
+    mommu = PeekIndex<LorentzIndex>(UdSdU, mu);
+    std::cout << GridLogMessage << " dsdumu  " << norm2(mommu) << std::endl;
+    mommu = mommu + adj(mommu);
+    std::cout << GridLogMessage << " dsdumu + dag  " << norm2(mommu) << std::endl;
+    std::cout << "" << std::endl;
+  }
+  /////////////////////////////////////////////////////
+
+  for (int mu = 0; mu < Nd; mu++)
+  {
+    forcemu = PeekIndex<LorentzIndex>(UdSdU, mu);
+    mommu = PeekIndex<LorentzIndex>(mom, mu);
+
+    // Update PF action density
+    dS = dS + trace(mommu * forcemu) * dt;
+
+    dSmom = dSmom - trace(mommu * forcemu) * dt;
+    dSmom2 = dSmom2 - trace(forcemu * forcemu) * (0.25 * dt * dt);
+
+    // Update mom action density
+    mommu = mommu + forcemu * (dt * 0.5);
+
+    Hmomprime -= real(sum(trace(mommu * mommu)));
+  }
+
+  ComplexD dSpred = sum(dS); // predicted change of S from the force
+  ComplexD dSm = sum(dSmom);
+  ComplexD dSm2 = sum(dSmom2);
+
+  std::cout << GridLogMessage << "Initial mom hamiltonian is " << Hmom << std::endl;
+  std::cout << GridLogMessage << "Final   mom hamiltonian is " << Hmomprime << std::endl;
+  std::cout << GridLogMessage << "Delta   mom hamiltonian is " << Hmomprime - Hmom << std::endl;
+
+  std::cout << GridLogMessage << " S      " << S << std::endl;
+  std::cout << GridLogMessage << " Sprime " << Sprime << std::endl;
+  std::cout << GridLogMessage << "dS (S' - S)          :" << Sprime - S << std::endl;
+  std::cout << GridLogMessage << "predict dS (force)   :" << dSpred << std::endl;
+  std::cout << GridLogMessage << "dSm " << dSm << std::endl;
+  std::cout << GridLogMessage << "dSm2" << dSm2 << std::endl;
+
+  std::cout << GridLogMessage << "Total dS    " << Hmomprime - Hmom + Sprime - S << std::endl;
+
+  assert(fabs(real(Sprime - S - dSpred)) < 1.0); // measured vs predicted dS; NOTE(review): assert is a no-op when NDEBUG is defined
+
+  std::cout << GridLogMessage << "Done" << std::endl;
+  Grid_finalize();
+}
diff --git a/tests/hadrons/Makefile.am b/tests/hadrons/Makefile.am
index c8ec1612..1e6a9fb4 100644
--- a/tests/hadrons/Makefile.am
+++ b/tests/hadrons/Makefile.am
@@ -1,3 +1,3 @@
-AM_LDFLAGS += -L../../extras/Hadrons
+AM_LDFLAGS += -L../../Hadrons
 
 include Make.inc
diff --git a/tests/hadrons/Test_QED.cc b/tests/hadrons/Test_QED.cc
new file mode 100644
index 00000000..9053c6ed
--- /dev/null
+++ b/tests/hadrons/Test_QED.cc
@@ -0,0 +1,265 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Tests/Hadrons/Test_QED.cc
+
+Copyright (C) 2015-2018
+
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Vera Guelpers    <v.m.guelpers@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Application.hpp>
+#include <Hadrons/Modules.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+
+int main(int argc, char *argv[]) // Builds and runs a Hadrons application: DWF point propagators plus tadpole/photon sequential insertions and their meson contractions
+{
+    // initialization //////////////////////////////////////////////////////////
+    Grid_init(&argc, &argv);
+    // mirror the activity of each Grid logger onto the matching Hadrons logger
+    HadronsLogError.Active(GridLogError.isActive());
+    HadronsLogWarning.Active(GridLogWarning.isActive());
+    HadronsLogMessage.Active(GridLogMessage.isActive());
+    HadronsLogIterative.Active(GridLogIterative.isActive());
+    HadronsLogDebug.Active(GridLogDebug.isActive());
+    LOG(Message) << "Grid initialized" << std::endl;
+
+
+    // run setup ///////////////////////////////////////////////////////////////
+    Application              application;
+    std::vector<std::string> flavour = {"h"}; //{"l", "s", "c1", "c2", "c3"};
+    std::vector<double>      mass    = {.2}; //{.01, .04, .2  , .25 , .3  };
+
+    unsigned int  nt    = GridDefaultLatt()[Tp]; // lattice extent in direction Tp; used as tB = nt-1 for the sequential sources
+    
+    // global parameters
+    Application::GlobalPar globalPar;
+    globalPar.trajCounter.start = 1500;
+    globalPar.trajCounter.end   = 1520;
+    globalPar.trajCounter.step  = 20;
+    globalPar.runId             = "test";
+    application.setPar(globalPar);
+    // gauge field
+    application.createModule<MGauge::Unit>("gauge");
+    // pt source
+    MSource::Point::Par ptPar;
+    ptPar.position = "0 0 0 0";
+    application.createModule<MSource::Point>("pt", ptPar);
+    // sink
+    MSink::Point::Par sinkPar;
+    sinkPar.mom = "0 0 0";
+    application.createModule<MSink::ScalarPoint>("sink", sinkPar);
+    
+    // set fermion boundary conditions to be periodic space, antiperiodic time.
+    std::string boundary = "1 1 1 -1";
+    std::string twist    = "0. 0. 0. 0."; // NOTE(review): never read; actionPar.twist below is set from an identical literal
+
+    //stochastic photon field
+    MGauge::StochEm::Par photonPar;
+    photonPar.gauge = PhotonR::Gauge::feynman;
+    photonPar.zmScheme = PhotonR::ZmScheme::qedL;
+    application.createModule<MGauge::StochEm>("ph_field", photonPar);
+
+
+
+    for (unsigned int i = 0; i < flavour.size(); ++i)
+    {
+        // actions
+        MAction::DWF::Par actionPar;
+        actionPar.gauge = "gauge";
+        actionPar.Ls    = 8;
+        actionPar.M5    = 1.8;
+        actionPar.mass  = mass[i];
+        actionPar.boundary = boundary;
+        actionPar.twist = "0. 0. 0. 0.";
+        application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
+
+        
+        // solvers
+        MSolver::RBPrecCG::Par solverPar;
+        solverPar.action       = "DWF_" + flavour[i];
+        solverPar.residual     = 1.0e-8;
+        solverPar.maxIteration = 10000;
+        application.createModule<MSolver::RBPrecCG>("CG_" + flavour[i],
+                                                    solverPar);
+        
+        // propagators
+        MFermion::GaugeProp::Par quarkPar;
+        quarkPar.solver = "CG_" + flavour[i];
+        quarkPar.source = "pt";
+        application.createModule<MFermion::GaugeProp>("Qpt_" + flavour[i],
+							 quarkPar);
+
+
+	//seq sources with tadpole insertion
+        MSource::SeqConserved::Par seqPar_T;
+        seqPar_T.q         = "Qpt_" + flavour[i] + "_5d"; // refers to the point propagator's name with "_5d" appended
+        seqPar_T.action    = "DWF_" + flavour[i];
+        seqPar_T.tA        = 0;
+        seqPar_T.tB        = nt-1;
+        seqPar_T.curr_type = Current::Tadpole;
+	seqPar_T.mu_min	   = 0;
+	seqPar_T.mu_max	   = 3;
+        seqPar_T.mom       = "0. 0. 0. 0.";
+        application.createModule<MSource::SeqConserved>("Qpt_" + flavour[i]
+							 + "_seq_T", seqPar_T);
+        // seq propagator with tadpole insertion
+        MFermion::GaugeProp::Par quarkPar_seq_T;
+        quarkPar_seq_T.solver = "CG_" + flavour[i];
+        quarkPar_seq_T.source = "Qpt_" + flavour[i] + "_seq_T";
+        application.createModule<MFermion::GaugeProp>("Qpt_" + flavour[i]
+							+ "_seq_T" + flavour[i], // no '_' before the flavour; mesPar_seq_T.q1 below uses the same spelling
+							quarkPar_seq_T);
+
+
+
+	//seq sources with conserved vector and photon insertion
+        MSource::SeqConserved::Par seqPar_V;
+        seqPar_V.q         = "Qpt_" + flavour[i] + "_5d";
+        seqPar_V.action    = "DWF_" + flavour[i];
+        seqPar_V.tA        = 0;
+        seqPar_V.tB        = nt-1;
+        seqPar_V.curr_type = Current::Vector;
+	seqPar_V.mu_min	   = 0;
+	seqPar_V.mu_max	   = 3;
+        seqPar_V.mom       = "0. 0. 0. 0.";
+	seqPar_V.photon	   = "ph_field";
+        application.createModule<MSource::SeqConserved>("Qpt_" + flavour[i] 
+						    + "_seq_V_ph", seqPar_V);
+        // seq propagator with conserved vector and photon insertion
+        MFermion::GaugeProp::Par quarkPar_seq_V;
+        quarkPar_seq_V.solver = "CG_" + flavour[i];
+        quarkPar_seq_V.source = "Qpt_" + flavour[i] + "_seq_V_ph";
+        application.createModule<MFermion::GaugeProp>("Qpt_" + flavour[i] 
+						+ "_seq_V_ph_" + flavour[i], 
+							quarkPar_seq_V);
+
+
+
+	//double seq sources with conserved vector and photon insertion
+	//(for self energy)
+        MSource::SeqConserved::Par seqPar_VV;
+        seqPar_VV.q         = "Qpt_" + flavour[i] + "_seq_V_ph_" 
+				+ flavour[i] + "_5d"; // name of the single-insertion propagator above with "_5d" appended
+        seqPar_VV.action    = "DWF_" + flavour[i];
+        seqPar_VV.tA        = 0;
+        seqPar_VV.tB        = nt-1;
+        seqPar_VV.curr_type = Current::Vector;
+	seqPar_VV.mu_min    = 0;
+	seqPar_VV.mu_max    = 3;
+        seqPar_VV.mom       = "0. 0. 0. 0.";
+	seqPar_VV.photon    = "ph_field";
+        application.createModule<MSource::SeqConserved>("Qpt_" + flavour[i] 
+						+ "_seq_V_ph" + flavour[i] // source name has no '_' before the flavour (matches quarkPar_seq_VV.source below)
+						+ "_seq_V_ph", seqPar_VV);
+        //double seq propagator with conserved vector and photon insertion
+        MFermion::GaugeProp::Par quarkPar_seq_VV;
+        quarkPar_seq_VV.solver = "CG_" + flavour[i];
+        quarkPar_seq_VV.source = "Qpt_" + flavour[i] + "_seq_V_ph" 
+						+ flavour[i] + "_seq_V_ph";
+        application.createModule<MFermion::GaugeProp>("Qpt_" + flavour[i] 
+						+ "_seq_V_ph_" + flavour[i] 
+						+ "_seq_V_ph_" + flavour[i], 
+							quarkPar_seq_VV);
+
+
+
+    }
+    for (unsigned int i = 0; i < flavour.size(); ++i)
+    for (unsigned int j = i; j < flavour.size(); ++j) // j starts at i: each unordered flavour pair is contracted once
+    {
+        //2pt function contraction
+        MContraction::Meson::Par mesPar;
+        mesPar.output  = "QED/pt_" + flavour[i] + flavour[j];
+        mesPar.q1      = "Qpt_" + flavour[i];
+        mesPar.q2      = "Qpt_" + flavour[j];
+        mesPar.gammas  = "(Gamma5 Gamma5)";
+        mesPar.sink    = "sink";
+        application.createModule<MContraction::Meson>("meson_pt_"
+                                                      + flavour[i] + flavour[j],
+                                                      mesPar);
+
+
+
+        //tadpole contraction
+        MContraction::Meson::Par mesPar_seq_T;
+        mesPar_seq_T.output  = "QED/tadpole_pt_" + flavour[i] + "_T_" 
+					+ flavour[i] + "__" + flavour[j];
+        mesPar_seq_T.q1      = "Qpt_" + flavour[i] + "_seq_T" + flavour[i];
+        mesPar_seq_T.q2      = "Qpt_" + flavour[j];
+        mesPar_seq_T.gammas  = "(Gamma5 Gamma5)";
+        mesPar_seq_T.sink    = "sink";
+        application.createModule<MContraction::Meson>("meson_tadpole_pt_" +
+                                                      flavour[i] + "_seq_T" 
+						      + flavour[i] + flavour[j],
+                                                      mesPar_seq_T);
+
+
+
+        //photon exchange contraction
+        MContraction::Meson::Par mesPar_seq_E;
+        mesPar_seq_E.output  = "QED/exchange_pt_" + flavour[i] + "_V_ph_" 
+				+ flavour[i] + "__" + flavour[j] + "_V_ph_"
+				+ flavour[j];
+        mesPar_seq_E.q1      = "Qpt_" + flavour[i] + "_seq_V_ph_" + flavour[i];
+        mesPar_seq_E.q2      = "Qpt_" + flavour[j] + "_seq_V_ph_" + flavour[j];
+        mesPar_seq_E.gammas  = "(Gamma5 Gamma5)";
+        mesPar_seq_E.sink    = "sink";
+        application.createModule<MContraction::Meson>("meson_exchange_pt_" 
+					+ flavour[i] + "_seq_V_ph_" + flavour[i] 
+					+ flavour[j] + "_seq_V_ph_" + flavour[j],
+                                                      mesPar_seq_E);
+
+
+
+        //self energy contraction
+        MContraction::Meson::Par mesPar_seq_S;
+        mesPar_seq_S.output  = "QED/selfenergy_pt_" + flavour[i] + "_V_ph_" 
+				+ flavour[i] + "_V_ph_" + flavour[i] + "__" 
+				+  flavour[j];
+        mesPar_seq_S.q1      = "Qpt_" + flavour[i] + "_seq_V_ph_" + flavour[i] 
+				+ "_seq_V_ph_" + flavour[i];
+        mesPar_seq_S.q2      = "Qpt_" + flavour[j];
+        mesPar_seq_S.gammas  = "(Gamma5 Gamma5)";
+        mesPar_seq_S.sink    = "sink";
+        application.createModule<MContraction::Meson>("meson_selfenergy_pt_" 
+						    + flavour[i] + "_seq_V_ph_" 
+						    + flavour[i] + "_seq_V_ph_" 
+						    + flavour[i] + flavour[j],
+                                                       mesPar_seq_S);
+
+    }
+
+
+
+    
+    // execution: write the assembled module graph to QED.xml, then run it
+    application.saveParameterFile("QED.xml");
+    application.run();
+    
+    // epilogue
+    LOG(Message) << "Grid is finalizing now" << std::endl;
+    Grid_finalize();
+    
+    return EXIT_SUCCESS;
+}
diff --git a/tests/hadrons/Test_diskvector.cc b/tests/hadrons/Test_diskvector.cc
new file mode 100644
index 00000000..10bc4db1
--- /dev/null
+++ b/tests/hadrons/Test_diskvector.cc
@@ -0,0 +1,114 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Tests/Hadrons/Test_diskvector.cc
+
+Copyright (C) 2015-2018
+
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#define DV_DEBUG
+#include <Hadrons/DiskVector.hpp>
+
+using namespace Grid;
+using namespace Grid::QCD;
+using namespace Grid::Hadrons;
+
+GRID_SERIALIZABLE_ENUM(Enum, undef, red, 1, blue, 2, green, 3);
+
+class Object: Serializable { // test payload for the disk vector: an Enum tag plus a SpinColourMatrix
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(Object,
+                                  Enum, e,
+                                  SpinColourMatrix, scm); // declares the members e and scm through the serialisation macro
+};
+
+#ifdef HAVE_HDF5
+typedef Hdf5Reader TestReader;
+typedef Hdf5Writer TestWriter;
+#else
+typedef BinaryReader TestReader;
+typedef BinaryWriter TestWriter;
+#endif
+
+int main(int argc, char *argv[]) // Writes entries to two disk vectors, reads some back and logs whether they match
+{
+    Grid_init(&argc, &argv);
+
+    GridSerialRNG rng; // NOTE(review): no seeding call is visible in this function
+    Object        obj, v2w, v2r, v13w, v13r; // v2w/v13w keep copies of what is written at indices 2 and 13; v2r/v13r receive the read-back values
+
+    SerializableDiskVector<Object, TestReader, TestWriter> v("diskvector_test", 1000, 4); // presumably (directory name, size, cache size); confirm against DiskVector.hpp
+
+    // write 12 entries (indices 32, 2, 6..15), refreshing obj.scm with random data before each write
+    obj.e = Enum::red;
+    random(rng, obj.scm);
+    v[32] = obj;
+    random(rng, obj.scm);
+    v[2] = obj;
+    v2w  = obj;
+    random(rng, obj.scm);
+    v[6] = obj;
+    random(rng, obj.scm);
+    v[7] = obj;
+    random(rng, obj.scm);
+    v[8] = obj;
+    random(rng, obj.scm);
+    v[9] = obj;
+    random(rng, obj.scm);
+    v[10] = obj;
+    random(rng, obj.scm);
+    v[11] = obj;
+    random(rng, obj.scm);
+    v[12] = obj;
+    random(rng, obj.scm);
+    v[13] = obj;
+    v13w  = obj;
+    random(rng, obj.scm);
+    v[14] = obj;
+    random(rng, obj.scm);
+    v[15] = obj;
+
+    // read back two of the written entries and compare with the saved copies
+    v2r = v[2];
+    LOG(Message) << "v[2] correct? " 
+                 << ((v2r == v2w) ? "yes" : "no" ) << std::endl; // NOTE(review): a mismatch is only logged, it does not fail the test
+    v13r = v[13];
+    LOG(Message) << "v[13] correct? " 
+                 << ((v13r == v13w) ? "yes" : "no" ) << std::endl;
+    LOG(Message) << "hit ratio " << v.hitRatio() << std::endl;
+
+    EigenDiskVector<ComplexD>         w("eigendiskvector_test", 1000, 4);
+    EigenDiskVector<ComplexD>::Matrix m,n;
+
+    w[2] = EigenDiskVectorMat<ComplexD>::Random(2000, 2000);
+    m    = w[2]; // read back immediately after the write
+    w[3] = EigenDiskVectorMat<ComplexD>::Random(2000, 2000);
+    w[4] = EigenDiskVectorMat<ComplexD>::Random(2000, 2000);
+    w[5] = EigenDiskVectorMat<ComplexD>::Random(2000, 2000);
+    w[6] = EigenDiskVectorMat<ComplexD>::Random(2000, 2000);
+    w[7] = EigenDiskVectorMat<ComplexD>::Random(2000, 2000);
+    n    = w[2]; // read again after five further writes
+    LOG(Message) << "w[2] correct? " 
+                 << ((m == n) ? "yes" : "no" ) << std::endl;
+    LOG(Message) << "hit ratio " << w.hitRatio() << std::endl;
+
+    Grid_finalize();
+    
+    return EXIT_SUCCESS; // returned regardless of the comparison results logged above
+}
diff --git a/tests/hadrons/Test_free_prop.cc b/tests/hadrons/Test_free_prop.cc
new file mode 100644
index 00000000..abf05e33
--- /dev/null
+++ b/tests/hadrons/Test_free_prop.cc
@@ -0,0 +1,246 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Tests/Hadrons/Test_free_prop.cc
+
+Copyright (C) 2015-2018
+
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ Author: Vera Guelpers    <v.m.guelpers@soton.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Application.hpp>
+#include <Hadrons/Modules.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+
+int main(int argc, char *argv[])
+{
+    // initialization: start Grid, then copy each Grid logger's active flag to the matching Hadrons logger
+    Grid_init(&argc, &argv);
+    HadronsLogError.Active(GridLogError.isActive());
+    HadronsLogWarning.Active(GridLogWarning.isActive());
+    HadronsLogMessage.Active(GridLogMessage.isActive());
+    HadronsLogIterative.Active(GridLogIterative.isActive());
+    HadronsLogDebug.Active(GridLogDebug.isActive());
+    LOG(Message) << "Grid initialized" << std::endl;
+
+
+    // run setup: flavour names and masses used to name and configure the modules created below
+    Application              application;
+    std::vector<std::string> flavour = {"h"}; //{"l", "s", "c1", "c2", "c3"};
+    std::vector<double>      mass    = {.2}; //{.01, .04, .2  , .25 , .3  };
+    std::vector<std::string> lepton_flavour    = {"mu"};
+    std::vector<double>      lepton_mass    = {.2};
+
+    unsigned int  nt    = GridDefaultLatt()[Tp]; // NOTE(review): nt is never read below
+    
+    // global parameters: trajectory counter start/end/step and the run id
+    Application::GlobalPar globalPar;
+    globalPar.trajCounter.start = 1500;
+    globalPar.trajCounter.end   = 1520;
+    globalPar.trajCounter.step  = 20;
+    globalPar.runId             = "test";
+    application.setPar(globalPar);
+    // unit gauge field "gauge", referenced by the actions of the inversion-based propagators
+    application.createModule<MGauge::Unit>("gauge");
+    // second unit gauge field "free_gauge", referenced by the lepton (free propagator) actions
+    application.createModule<MGauge::Unit>("free_gauge");
+    // point source "pt" at position "0 0 0 0", shared by every propagator module below
+    MSource::Point::Par ptPar;
+    ptPar.position = "0 0 0 0";
+    application.createModule<MSource::Point>("pt", ptPar);
+    // sink "sink" with momentum "0 0 0" (parameters typed as MSink::Point::Par, module is MSink::ScalarPoint)
+    MSink::Point::Par sinkPar;
+    sinkPar.mom = "0 0 0";
+    application.createModule<MSink::ScalarPoint>("sink", sinkPar);
+    
+    // set fermion boundary conditions to be periodic space, antiperiodic time.
+    std::string boundary = "1 1 1 -1";
+
+
+    // Propagators from FFT and Feynman rules (MFermion::FreeProp modules, one DWF and one Wilson per lepton)
+    for (unsigned int i = 0; i < lepton_mass.size(); ++i)
+    {
+        // DWF action "free_DWF_<lepton>" on "free_gauge" (Ls = 8, M5 = 1.8)
+        MAction::DWF::Par actionPar_lep;
+        actionPar_lep.gauge = "free_gauge";
+        actionPar_lep.Ls    = 8;
+        actionPar_lep.M5    = 1.8;
+        actionPar_lep.mass  = lepton_mass[i];
+        actionPar_lep.boundary = boundary;
+        application.createModule<MAction::DWF>("free_DWF_" + lepton_flavour[i], actionPar_lep);
+
+        // DWF free propagator "Lpt_<lepton>" from source "pt", with twist "0 0 0 0.5"
+        MFermion::FreeProp::Par freePar;
+        freePar.source = "pt";
+	freePar.action = "free_DWF_" + lepton_flavour[i];
+	freePar.twist = "0 0 0 0.5";
+        freePar.mass = lepton_mass[i];
+        application.createModule<MFermion::FreeProp>("Lpt_" + lepton_flavour[i],
+							 freePar);
+
+        // Wilson action "free_W_<lepton>" on "free_gauge"
+        MAction::Wilson::Par actionPar_lep_W;
+        actionPar_lep_W.gauge = "free_gauge";
+        actionPar_lep_W.mass  = lepton_mass[i];
+        actionPar_lep_W.boundary = boundary;
+        application.createModule<MAction::Wilson>("free_W_" + lepton_flavour[i], actionPar_lep_W);
+
+        // Wilson free propagator "W_Lpt_<lepton>" from source "pt", with twist "0 0 0 0.5"
+        MFermion::FreeProp::Par freePar_W;
+        freePar_W.source = "pt";
+	freePar_W.action = "free_W_" + lepton_flavour[i];
+	freePar_W.twist = "0 0 0 0.5";
+        freePar_W.mass = lepton_mass[i];
+        application.createModule<MFermion::FreeProp>("W_Lpt_" + lepton_flavour[i],
+							 freePar_W);
+
+
+    }
+
+
+
+    // Propagators from inversion (action + RBPrecCG solver + MFermion::GaugeProp, for DWF and Wilson)
+    for (unsigned int i = 0; i < flavour.size(); ++i)
+    {
+        // DWF action "DWF_<flavour>" on "gauge" (Ls = 8, M5 = 1.8)
+        MAction::DWF::Par actionPar;
+        actionPar.gauge = "gauge";
+        actionPar.Ls    = 8;
+        actionPar.M5    = 1.8;
+        actionPar.mass  = mass[i];
+        actionPar.boundary = boundary;
+        application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
+        
+        // solver "CG_<flavour>" for the DWF action: residual 1.0e-8, at most 10000 iterations
+        MSolver::RBPrecCG::Par solverPar;
+        solverPar.action       = "DWF_" + flavour[i];
+        solverPar.residual     = 1.0e-8;
+        solverPar.maxIteration = 10000;
+        application.createModule<MSolver::RBPrecCG>("CG_" + flavour[i],
+                                                    solverPar);
+        
+        // DWF propagator "Qpt_<flavour>" from source "pt" using solver "CG_<flavour>"
+        MFermion::GaugeProp::Par quarkPar;
+        quarkPar.solver = "CG_" + flavour[i];
+        quarkPar.source = "pt";
+        application.createModule<MFermion::GaugeProp>("Qpt_" + flavour[i],
+							 quarkPar);
+
+
+
+        // Wilson action "W_<flavour>" on "gauge"
+        MAction::Wilson::Par actionPar_W;
+        actionPar_W.gauge = "gauge";
+        actionPar_W.mass  = mass[i];
+        actionPar_W.boundary = boundary;
+        application.createModule<MAction::Wilson>("W_" + flavour[i], actionPar_W);
+
+        
+        // solver "W_CG_<flavour>" for the Wilson action: residual 1.0e-8, at most 10000 iterations
+        MSolver::RBPrecCG::Par solverPar_W;
+        solverPar_W.action       = "W_" + flavour[i];
+        solverPar_W.residual     = 1.0e-8;
+        solverPar_W.maxIteration = 10000;
+        application.createModule<MSolver::RBPrecCG>("W_CG_" + flavour[i],
+                                                    solverPar_W);
+        
+        // Wilson propagator "W_Qpt_<flavour>" from source "pt" using solver "W_CG_<flavour>"
+        MFermion::GaugeProp::Par quarkPar_W;
+        quarkPar_W.solver = "W_CG_" + flavour[i];
+        quarkPar_W.source = "pt";
+        application.createModule<MFermion::GaugeProp>("W_Qpt_" + flavour[i],
+							 quarkPar_W);
+
+    }
+
+
+    // 2pt contraction for propagators from FFT and Feynman rules
+    for (unsigned int i = 0; i < lepton_flavour.size(); ++i)
+    for (unsigned int j = i; j < lepton_flavour.size(); ++j) // j starts at i: each unordered pair once
+    {
+        // 2pt function contraction DWF: Gamma5-Gamma5 meson built from the "Lpt_" propagators
+        MContraction::Meson::Par freemesPar;
+        freemesPar.output  = "2pt_free/DWF_L_pt_" + lepton_flavour[i] + lepton_flavour[j];
+        freemesPar.q1      = "Lpt_" + lepton_flavour[i];
+        freemesPar.q2      = "Lpt_" + lepton_flavour[j];
+        freemesPar.gammas  = "(Gamma5 Gamma5)";
+        freemesPar.sink    = "sink";
+        application.createModule<MContraction::Meson>("meson_L_pt_"
+                                                      + lepton_flavour[i] + lepton_flavour[j],
+                                                      freemesPar);
+
+        // 2pt function contraction Wilson: Gamma5-Gamma5 meson built from the "W_Lpt_" propagators
+        MContraction::Meson::Par freemesPar_W;
+        freemesPar_W.output  = "2pt_free/W_L_pt_" + lepton_flavour[i] + lepton_flavour[j];
+        freemesPar_W.q1      = "W_Lpt_" + lepton_flavour[i];
+        freemesPar_W.q2      = "W_Lpt_" + lepton_flavour[j];
+        freemesPar_W.gammas  = "(Gamma5 Gamma5)";
+        freemesPar_W.sink    = "sink";
+        application.createModule<MContraction::Meson>("W_meson_L_pt_"
+                                                      + lepton_flavour[i] + lepton_flavour[j],
+                                                      freemesPar_W);
+
+    }
+
+    // 2pt contraction for propagators from inversion
+    for (unsigned int i = 0; i < flavour.size(); ++i)
+    for (unsigned int j = i; j < flavour.size(); ++j) // j starts at i: each unordered pair once
+    {
+        // 2pt function contraction DWF: Gamma5-Gamma5 meson built from the "Qpt_" propagators
+        MContraction::Meson::Par mesPar;
+        mesPar.output  = "2pt_free/DWF_pt_" + flavour[i] + flavour[j];
+        mesPar.q1      = "Qpt_" + flavour[i];
+        mesPar.q2      = "Qpt_" + flavour[j];
+        mesPar.gammas  = "(Gamma5 Gamma5)";
+        mesPar.sink    = "sink";
+        application.createModule<MContraction::Meson>("meson_pt_"
+                                                      + flavour[i] + flavour[j],
+                                                      mesPar);
+
+
+        // 2pt function contraction Wilson: Gamma5-Gamma5 meson built from the "W_Qpt_" propagators
+        MContraction::Meson::Par mesPar_W;
+        mesPar_W.output  = "2pt_free/W_pt_" + flavour[i] + flavour[j];
+        mesPar_W.q1      = "W_Qpt_" + flavour[i];
+        mesPar_W.q2      = "W_Qpt_" + flavour[j];
+        mesPar_W.gammas  = "(Gamma5 Gamma5)";
+        mesPar_W.sink    = "sink";
+        application.createModule<MContraction::Meson>("W_meson_pt_"
+                                                      + flavour[i] + flavour[j],
+                                                      mesPar_W);
+
+    }
+
+
+
+    // execution: write the parameters to "free_prop.xml", then run the application
+    application.saveParameterFile("free_prop.xml");
+    application.run();
+    
+    // epilogue: log, shut Grid down and report success
+    LOG(Message) << "Grid is finalizing now" << std::endl;
+    Grid_finalize();
+    
+    return EXIT_SUCCESS;
+}
diff --git a/tests/hadrons/Test_hadrons.hpp b/tests/hadrons/Test_hadrons.hpp
index 0265f5a6..7f07bea5 100644
--- a/tests/hadrons/Test_hadrons.hpp
+++ b/tests/hadrons/Test_hadrons.hpp
@@ -1,31 +1,33 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
+/*************************************************************************************
 
- Source file: tests/hadrons/Test_hadrons.hpp
+Grid physics library, www.github.com/paboyle/Grid 
 
- Copyright (C) 2017
+Source file: Tests/Hadrons/Test_hadrons.hpp
+
+Copyright (C) 2015-2018
 
  Author: Andrew Lawson <andrew.lawson1991@gmail.com>
 
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
 
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
 
-#include <Grid/Hadrons/Application.hpp>
+#include <Hadrons/Application.hpp>
+#include <Hadrons/Modules.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -49,7 +51,7 @@ using namespace Hadrons;
     globalPar.trajCounter.start    = 1500;      \
     globalPar.trajCounter.end      = 1520;      \
     globalPar.trajCounter.step     = 20;        \
-    globalPar.seed                 = "1 2 3 4"; \
+    globalPar.runId                = "test";    \
     globalPar.genetic.maxGen       = 1000;      \
     globalPar.genetic.maxCstGen    = 200;       \
     globalPar.genetic.popSize      = 20;        \
@@ -124,6 +126,7 @@ inline void makeWilsonAction(Application &application, std::string actionName,
         actionPar.gauge = gaugeField;
         actionPar.mass  = mass;
         actionPar.boundary = boundary;
+        actionPar.twist = "0. 0. 0. 0.";
         application.createModule<MAction::Wilson>(actionName, actionPar);
     }
 }
@@ -152,6 +155,7 @@ inline void makeDWFAction(Application &application, std::string actionName,
         actionPar.M5    = M5;
         actionPar.mass  = mass;
         actionPar.boundary = boundary;
+        actionPar.twist = "0. 0. 0. 0.";
         application.createModule<MAction::DWF>(actionName, actionPar);
     }
 }
@@ -176,8 +180,9 @@ inline void makeRBPrecCGSolver(Application &application, std::string &solverName
     if (!(VirtualMachine::getInstance().hasModule(solverName)))
     {
         MSolver::RBPrecCG::Par solverPar;
-        solverPar.action   = actionName;
-        solverPar.residual = residual;
+        solverPar.action       = actionName;
+        solverPar.residual     = residual;
+        solverPar.maxIteration = 10000;
         application.createModule<MSolver::RBPrecCG>(solverName,
                                                     solverPar);
     }
@@ -263,7 +268,8 @@ inline void makeConservedSequentialSource(Application &application,
         seqPar.tA        = tS;
         seqPar.tB        = tS;
         seqPar.curr_type = curr;
-        seqPar.mu        = mu;
+        seqPar.mu_min    = mu; // the old single `mu` field became a range: set both ends to mu
+        seqPar.mu_max    = mu; // was a duplicated mu_min assignment, which left mu_max unset
         seqPar.mom       = mom;
         application.createModule<MSource::SeqConserved>(srcName, seqPar);
     }
diff --git a/tests/hadrons/Test_hadrons_3pt_contractions.cc b/tests/hadrons/Test_hadrons_3pt_contractions.cc
index 1d38b4f8..09c478bb 100644
--- a/tests/hadrons/Test_hadrons_3pt_contractions.cc
+++ b/tests/hadrons/Test_hadrons_3pt_contractions.cc
@@ -1,29 +1,30 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
- 
- Source file: tests/hadrons/Test_hadrons_3pt_contractions.cc
- 
- Copyright (C) 2017
- 
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Tests/Hadrons/Test_hadrons_3pt_contractions.cc
+
+Copyright (C) 2015-2018
+
  Author: Andrew Lawson <andrew.lawson1991@gmail.com>
- 
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
- 
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- 
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
 
 #include "Test_hadrons.hpp"
 
diff --git a/tests/hadrons/Test_hadrons_conserved_current.cc b/tests/hadrons/Test_hadrons_conserved_current.cc
index e91d8245..b8345af7 100644
--- a/tests/hadrons/Test_hadrons_conserved_current.cc
+++ b/tests/hadrons/Test_hadrons_conserved_current.cc
@@ -1,29 +1,30 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
- 
- Source file: tests/hadrons/Test_hadrons_conserved_current.cc
- 
- Copyright (C) 2017
- 
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Tests/Hadrons/Test_hadrons_conserved_current.cc
+
+Copyright (C) 2015-2018
+
  Author: Andrew Lawson <andrew.lawson1991@gmail.com>
- 
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
- 
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- 
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
 
 #include "Test_hadrons.hpp"
 
diff --git a/tests/hadrons/Test_hadrons_meson_3pt.cc b/tests/hadrons/Test_hadrons_meson_3pt.cc
index 382c39d4..741e2a7c 100644
--- a/tests/hadrons/Test_hadrons_meson_3pt.cc
+++ b/tests/hadrons/Test_hadrons_meson_3pt.cc
@@ -1,31 +1,33 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
- 
- Source file: tests/hadrons/Test_hadrons_meson_3pt.cc
- 
- Copyright (C) 2015
- 
- Author: Antonin Portelli <antonin.portelli@me.com>
- 
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
- 
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- 
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
+/*************************************************************************************
 
-#include <Grid/Hadrons/Application.hpp>
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Tests/Hadrons/Test_hadrons_meson_3pt.cc
+
+Copyright (C) 2015-2018
+
+ Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Application.hpp>
+#include <Hadrons/Modules.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -52,7 +54,7 @@ int main(int argc, char *argv[])
     globalPar.trajCounter.start    = 1500;
     globalPar.trajCounter.end      = 1520;
     globalPar.trajCounter.step     = 20;
-    globalPar.seed                 = "1 2 3 4";
+    globalPar.runId                = "test";
     globalPar.genetic.maxGen       = 1000;
     globalPar.genetic.maxCstGen    = 200;
     globalPar.genetic.popSize      = 20;
@@ -64,6 +66,7 @@ int main(int argc, char *argv[])
     
     // set fermion boundary conditions to be periodic space, antiperiodic time.
     std::string boundary = "1 1 1 -1";
+    std::string twist = "0. 0. 0. 0.";
 
     // sink
     MSink::Point::Par sinkPar;
@@ -78,12 +81,14 @@ int main(int argc, char *argv[])
         actionPar.M5    = 1.8;
         actionPar.mass  = mass[i];
         actionPar.boundary = boundary;
+        actionPar.twist = twist;
         application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
         
         // solvers
         MSolver::RBPrecCG::Par solverPar;
-        solverPar.action   = "DWF_" + flavour[i];
-        solverPar.residual = 1.0e-8;
+        solverPar.action       = "DWF_" + flavour[i];
+        solverPar.residual     = 1.0e-8;
+        solverPar.maxIteration = 10000;
         application.createModule<MSolver::RBPrecCG>("CG_" + flavour[i],
                                                     solverPar);
     }
diff --git a/tests/hadrons/Test_hadrons_meson_conserved_3pt.cc b/tests/hadrons/Test_hadrons_meson_conserved_3pt.cc
index 3a887cc2..1246de02 100644
--- a/tests/hadrons/Test_hadrons_meson_conserved_3pt.cc
+++ b/tests/hadrons/Test_hadrons_meson_conserved_3pt.cc
@@ -1,29 +1,30 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
+/*************************************************************************************
 
- Source file: tests/hadrons/Test_hadrons_meson_conserved_3pt.cc
+Grid physics library, www.github.com/paboyle/Grid 
 
- Copyright (C) 2017
+Source file: Tests/Hadrons/Test_hadrons_meson_conserved_3pt.cc
+
+Copyright (C) 2015-2018
 
  Author: Andrew Lawson <andrew.lawson1991@gmail.com>
 
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
 
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
 
 #include "Test_hadrons.hpp"
 
diff --git a/tests/hadrons/Test_hadrons_quark.cc b/tests/hadrons/Test_hadrons_quark.cc
deleted file mode 100644
index 7b7bd28a..00000000
--- a/tests/hadrons/Test_hadrons_quark.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
-
- Source file: tests/hadrons/Test_hadrons_quark.cc
-
- Copyright (C) 2017
-
- Author: Andrew Lawson <andrew.lawson1991@gmail.com>
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
-
-#include "Test_hadrons.hpp"
-#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
-
-using namespace Grid;
-using namespace QCD;
-using namespace Hadrons;
-
-/*******************************************************************************
- * Unit test functions within Quark module.
- ******************************************************************************/
-
-// Alternative 4D & 5D projections
-template<class vobj>
-inline void make_4D_with_gammas(Lattice<vobj> &in_5d, Lattice<vobj> &out_4d, int Ls)
-{
-    GridBase *_grid(out_4d.Grid());
-    Lattice<vobj> tmp(_grid);
-    Gamma G5(Gamma::Algebra::Gamma5);
-
-    ExtractSlice(tmp, in_5d, 0, 0);
-    out_4d = 0.5 * (tmp - G5*tmp);
-    ExtractSlice(tmp, in_5d, Ls - 1, 0);
-    out_4d += 0.5 * (tmp + G5*tmp);
-}
-
-template<class vobj>
-inline void make_5D_with_gammas(Lattice<vobj> &in_4d, Lattice<vobj> &out_5d, int Ls)
-{
-    out_5d = Zero();
-    Gamma G5(Gamma::Algebra::Gamma5);
-    GridBase *_grid(in_4d.Grid());
-    Lattice<vobj> tmp(_grid);
-
-    tmp = 0.5 * (in_4d + G5*in_4d);
-    InsertSlice(tmp, out_5d, 0, 0);
-    tmp = 0.5 * (in_4d - G5*in_4d);
-    InsertSlice(tmp, out_5d, Ls - 1, 0);
-}
-
-int main(int argc, char **argv)
-{
-    /***************************************************************************
-     * Initialisation.
-     **************************************************************************/
-    Grid_init(&argc, &argv);
-
-    std::vector<int> latt_size   = GridDefaultLatt();
-    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-    std::vector<int> mpi_layout  = GridDefaultMpi();
-
-    const int Ls = 8;
-
-    GridCartesian   UGrid(latt_size,simd_layout,mpi_layout);
-    GridCartesian   *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, &UGrid);
-    GridSerialRNG   sRNG;
-    GridParallelRNG pRNG(&UGrid);
-
-    std::vector<int> seeds4({1,2,3,4});
-    std::vector<int> seeds5({5,6,7,8});
-    GridParallelRNG  rng4(&UGrid);
-    GridParallelRNG  rng5(FGrid);
-    rng4.SeedFixedIntegers(seeds4);
-    rng5.SeedFixedIntegers(seeds5);
-
-    /***************************************************************************
-     * Build a 4D random source, and convert it to 5D.
-     **************************************************************************/
-    LatticeFermion test4(&UGrid);
-    LatticeFermion test5(FGrid);
-    LatticeFermion check5(FGrid);
-
-    gaussian(rng4, test4);
-    make_5D(test4, test5, Ls);
-    make_5D_with_gammas(test4, check5, Ls);
-    test5 -= check5;
-    std::cout << "4D -> 5D comparison, diff = " << Grid::sqrt(norm2(test5)) << std::endl;
-
-    /***************************************************************************
-     * Build a 5D random source, and project down to 4D.
-     **************************************************************************/
-    LatticeFermion check4(&UGrid);
-    gaussian(rng5, test5);
-    check5 = test5;
-
-    make_4D(test5, test4, Ls);
-    make_4D_with_gammas(check5, check4, Ls);
-    test4 -= check4;
-    std::cout << "5D -> 4D comparison, diff = " << Grid::sqrt(norm2(test4)) << std::endl;
-
-    /***************************************************************************
-     * Convert a propagator to a fermion & back.
-     **************************************************************************/
-    LatticeFermion ferm(&UGrid);
-    LatticePropagator prop(&UGrid), ref(&UGrid);
-    gaussian(rng4, prop);
-
-    // Define variables for sanity checking a single site.
-    typename SpinColourVector::scalar_object fermSite;
-    typename SpinColourMatrix::scalar_object propSite;
-    std::vector<int> site(Nd, 0);
-
-    for (int s = 0; s < Ns; ++s)
-    for (int c = 0; c < Nc; ++c)
-    {
-        ref = prop;
-        PropToFerm(ferm, prop, s, c);
-        FermToProp(prop, ferm, s, c);
-
-        std::cout << "Spin = " << s << ", Colour = " << c << std::endl;
-        ref -= prop;
-        std::cout << "Prop->Ferm->Prop test, diff = " << Grid::sqrt(norm2(ref)) << std::endl;
-
-        peekSite(fermSite, ferm, site);
-        peekSite(propSite, prop, site);
-        for (int s2 = 0; s2 < Ns; ++s2)
-        for (int c2 = 0; c2 < Nc; ++c2)
-        {
-            if (propSite()(s2, s)(c2, c) != fermSite()(s2)(c2))
-            {
-                std::cout << propSite()(s2, s)(c2, c) << " != "
-                          << fermSite()(s2)(c2) << " for spin = " << s2
-                          << ", col = " << c2 << std::endl;
-            }
-        }
-    }
-
-    Grid_finalize();
-    return EXIT_SUCCESS;
-}
diff --git a/tests/hadrons/Test_hadrons_seq_gamma.cc b/tests/hadrons/Test_hadrons_seq_gamma.cc
index ba711f88..5e10b9f5 100644
--- a/tests/hadrons/Test_hadrons_seq_gamma.cc
+++ b/tests/hadrons/Test_hadrons_seq_gamma.cc
@@ -1,29 +1,30 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
+/*************************************************************************************
 
- Source file: tests/hadrons/Test_hadrons_seq_gamma.cc
+Grid physics library, www.github.com/paboyle/Grid 
 
- Copyright (C) 2017
+Source file: Tests/Hadrons/Test_hadrons_seq_gamma.cc
+
+Copyright (C) 2015-2018
 
  Author: Andrew Lawson <andrew.lawson1991@gmail.com>
 
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
 
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
 
 #include "Test_hadrons.hpp"
 
diff --git a/tests/hadrons/Test_hadrons_spectrum.cc b/tests/hadrons/Test_hadrons_spectrum.cc
index 801674f7..af1dccd7 100644
--- a/tests/hadrons/Test_hadrons_spectrum.cc
+++ b/tests/hadrons/Test_hadrons_spectrum.cc
@@ -1,31 +1,33 @@
-/*******************************************************************************
- Grid physics library, www.github.com/paboyle/Grid
- 
- Source file: tests/hadrons/Test_hadrons_spectrum.cc
- 
- Copyright (C) 2015
- 
- Author: Antonin Portelli <antonin.portelli@me.com>
- 
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
- 
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- 
- See the full license in the file "LICENSE" in the top level distribution
- directory.
- *******************************************************************************/
+/*************************************************************************************
 
-#include <Grid/Hadrons/Application.hpp>
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Tests/Hadrons/Test_hadrons_spectrum.cc
+
+Copyright (C) 2015-2018
+
+ Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Application.hpp>
+#include <Hadrons/Modules.hpp>
 
 using namespace Grid;
 using namespace Hadrons;
@@ -51,7 +53,7 @@ int main(int argc, char *argv[])
     globalPar.trajCounter.start = 1500;
     globalPar.trajCounter.end   = 1520;
     globalPar.trajCounter.step  = 20;
-    globalPar.seed              = "1 2 3 4";
+    globalPar.runId             = "test";
     application.setPar(globalPar);
     // gauge field
     application.createModule<MGauge::Unit>("gauge");
@@ -70,6 +72,7 @@ int main(int argc, char *argv[])
     
     // set fermion boundary conditions to be periodic space, antiperiodic time.
     std::string boundary = "1 1 1 -1";
+    std::string twist = "0. 0. 0. 0.";
 
     for (unsigned int i = 0; i < flavour.size(); ++i)
     {
@@ -80,12 +83,14 @@ int main(int argc, char *argv[])
         actionPar.M5    = 1.8;
         actionPar.mass  = mass[i];
         actionPar.boundary = boundary;
+        actionPar.twist = twist;
         application.createModule<MAction::DWF>("DWF_" + flavour[i], actionPar);
         
         // solvers
         MSolver::RBPrecCG::Par solverPar;
-        solverPar.action   = "DWF_" + flavour[i];
-        solverPar.residual = 1.0e-8;
+        solverPar.action       = "DWF_" + flavour[i];
+        solverPar.residual     = 1.0e-8;
+        solverPar.maxIteration = 10000;
         application.createModule<MSolver::RBPrecCG>("CG_" + flavour[i],
                                                     solverPar);
         
diff --git a/tests/hadrons/Test_hadrons_wilsonFund.cc b/tests/hadrons/Test_hadrons_wilsonFund.cc
new file mode 100644
index 00000000..df621812
--- /dev/null
+++ b/tests/hadrons/Test_hadrons_wilsonFund.cc
@@ -0,0 +1,159 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid 
+
+Source file: Tests/Hadrons/Test_hadrons_wilsonFund.cc
+
+Copyright (C) 2015-2018
+
+ Author: Antonin Portelli <antonin.portelli@me.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Hadrons/Application.hpp>
+#include <Hadrons/Modules.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+
+int main(int argc, char *argv[])
+{
+    // initialization //////////////////////////////////////////////////////////
+    Grid_init(&argc, &argv);
+    HadronsLogError.Active(GridLogError.isActive());
+    HadronsLogWarning.Active(GridLogWarning.isActive());
+    HadronsLogMessage.Active(GridLogMessage.isActive());
+    HadronsLogIterative.Active(GridLogIterative.isActive());
+    HadronsLogDebug.Active(GridLogDebug.isActive());
+    LOG(Message) << "Grid initialized" << std::endl;
+    
+    // run setup ///////////////////////////////////////////////////////////////
+    Application              application;
+    std::vector<std::string> flavour = {"l"};
+    std::vector<double>      mass    = {-0.1};
+    double                   csw     = 0.0;
+    
+    // global parameters
+    Application::GlobalPar globalPar;
+  
+    globalPar.trajCounter.start = 309;
+    globalPar.trajCounter.end   = 310;
+    globalPar.trajCounter.step  = 1;
+    globalPar.runId             = "test";
+
+    application.setPar(globalPar);
+    // gauge field
+    application.createModule<MIO::LoadNersc>("gauge");
+   
+    // sources
+    //MSource::Z2::Par z2Par;
+    //z2Par.tA = 0;
+    //z2Par.tB = 0;
+    //application.createModule<MSource::Z2>("z2", z2Par);
+    MSource::Point::Par ptPar;
+    ptPar.position = "0 0 0 0";
+    application.createModule<MSource::Point>("pt", ptPar);
+    // sink
+    MSink::Point::Par sinkPar;
+    sinkPar.mom = "0 0 0";
+    application.createModule<MSink::ScalarPoint>("sink", sinkPar);
+    
+    // set fermion boundary conditions to be periodic space, antiperiodic time.
+    std::string boundary = "1 1 1 -1";
+
+    for (unsigned int i = 0; i < flavour.size(); ++i)
+    {
+        // actions
+        MAction::WilsonClover::Par actionPar;
+        actionPar.gauge = "gauge";
+        actionPar.mass  = mass[i];
+        actionPar.boundary = boundary;
+        actionPar.csw_r = csw;
+        actionPar.csw_t = csw;
+
+        // !!!!! Check if Anisotropy works  !!!!!
+        actionPar.clover_anisotropy.isAnisotropic= false;
+        actionPar.clover_anisotropy.t_direction  = 3    ;   // Explicit for D=4
+        actionPar.clover_anisotropy.xi_0         = 1.0  ;
+        actionPar.clover_anisotropy.nu           = 1.0  ;
+
+        application.createModule<MAction::WilsonClover>("WilsonClover_" + flavour[i], actionPar);
+        
+        // solvers
+        MSolver::RBPrecCG::Par solverPar;
+        solverPar.action       = "WilsonClover_" + flavour[i];
+        solverPar.residual     = 1.0e-8;
+        solverPar.maxIteration = 10000;
+        application.createModule<MSolver::RBPrecCG>("CG_" + flavour[i],
+                                                    solverPar);
+        
+        // propagators
+        MFermion::GaugeProp::Par quarkPar;
+        quarkPar.solver = "CG_" + flavour[i];
+        quarkPar.source = "pt";
+        application.createModule<MFermion::GaugeProp>("Qpt_" + flavour[i], quarkPar);
+ //       quarkPar.source = "z2";
+ //       application.createModule<MFermion::GaugeProp>("QZ2_" + flavour[i], quarkPar);
+    }
+    for (unsigned int i = 0; i < flavour.size(); ++i)
+    for (unsigned int j = i; j < flavour.size(); ++j)
+    {
+        MContraction::Meson::Par mesPar;
+        
+        mesPar.output  = "Fund_mesons/pt_" + flavour[i] + flavour[j];
+        mesPar.q1      = "Qpt_" + flavour[i];
+        mesPar.q2      = "Qpt_" + flavour[j];
+        mesPar.gammas  = "all";
+        mesPar.sink    = "sink";
+        application.createModule<MContraction::Meson>("meson_pt_"
+                                                      + flavour[i] + flavour[j],
+                                                      mesPar);
+   //     mesPar.output  = "mesons/Z2_" + flavour[i] + flavour[j];
+   //     mesPar.q1      = "QZ2_" + flavour[i];
+   //     mesPar.q2      = "QZ2_" + flavour[j];
+   //     mesPar.gammas  = "all";
+   //     mesPar.sink    = "sink";
+   //     application.createModule<MContraction::Meson>("meson_Z2_"
+   //                                                   + flavour[i] + flavour[j],
+   //                                                   mesPar);
+    }
+    for (unsigned int i = 0; i < flavour.size(); ++i)
+    for (unsigned int j = i; j < flavour.size(); ++j)
+    for (unsigned int k = j; k < flavour.size(); ++k)
+    {
+        MContraction::Baryon::Par barPar;
+        
+        barPar.output = "Fund_baryons/pt_" + flavour[i] + flavour[j] + flavour[k];
+        barPar.q1     = "Qpt_" + flavour[i];
+        barPar.q2     = "Qpt_" + flavour[j];
+        barPar.q3     = "Qpt_" + flavour[k];
+        application.createModule<MContraction::Baryon>(
+            "baryon_pt_" + flavour[i] + flavour[j] + flavour[k], barPar);
+    }
+    
+    // execution
+    application.saveParameterFile("WilsonClover_spectrum.xml");
+    application.run();
+    
+    // epilogue
+    LOG(Message) << "Grid is finalizing now" << std::endl;
+    Grid_finalize();
+    
+    return EXIT_SUCCESS;
+}
diff --git a/tests/hmc/Test_hmc_EODWFRatio.cc b/tests/hmc/Test_hmc_EODWFRatio.cc
index 402c46b9..93469ffe 100644
--- a/tests/hmc/Test_hmc_EODWFRatio.cc
+++ b/tests/hmc/Test_hmc_EODWFRatio.cc
@@ -146,13 +146,7 @@ int main(int argc, char **argv) {
   std::cout << GridLogMessage << "Denominator report, Dw(m) term (includes CG) : " << std::endl;
   DenOp.Report();
 
-
-
-
   Grid_finalize();
 
-
-
-
 } // main
 
diff --git a/tests/hmc/Test_hmc_EOWilsonCloverFermionGauge.cc b/tests/hmc/Test_hmc_EOWilsonCloverFermionGauge.cc
new file mode 100644
index 00000000..fbb8a4a3
--- /dev/null
+++ b/tests/hmc/Test_hmc_EOWilsonCloverFermionGauge.cc
@@ -0,0 +1,139 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/hmc/Test_hmc_EOWilsonCloverFermionGauge.cc
+
+Copyright (C) 2016
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+int main(int argc, char **argv) {
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+   // Typedefs to simplify notation
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;  // Uses the default minimum norm
+  typedef WilsonImplR FermionImplPolicy;
+  typedef WilsonCloverFermionR FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+
+  // Grid from the command line
+  TheHMC.Resources.AddFourDimGrid("gauge");
+  // Possible to create the module by hand 
+  // hardcoding parameters or using a Reader
+
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams;  
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix = "ckpoint_rng";
+  CPparams.saveInterval = 5;
+  CPparams.format = "IEEE64BIG";
+  
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection 
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  //////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+  // need wrappers of the fermionic classes 
+  // that have a complex construction
+  // standard
+  RealD beta = 5.6 ;
+  WilsonGaugeActionR Waction(beta);
+  
+  // temporarily need a gauge field
+  auto GridPtr = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  LatticeGaugeField U(GridPtr);
+
+  Real mass = 0.01;
+  Real csw = 1.0;
+
+  FermionAction FermOp(U, *GridPtr, *GridRBPtr, mass, csw);
+
+  ConjugateGradient<FermionField> CG(1.0e-8, 2000);
+
+  TwoFlavourEvenOddPseudoFermionAction<FermionImplPolicy> Nf2(FermOp, CG, CG);
+
+    // Set smearing (true/false), default: false
+  Nf2.is_smeared = false;
+
+
+    // Collect actions
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  Level1.push_back(&Nf2);
+
+  ActionLevel<HMCWrapper::Field> Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  /////////////////////////////////////////////////////////////
+
+  /*
+    double rho = 0.1;  // smearing parameter
+    int Nsmear = 2;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(
+        UGrid, Nsmear, Stout);
+  */
+
+  // HMC parameters are serialisable 
+  TheHMC.Parameters.MD.MDsteps = 20;
+  TheHMC.Parameters.MD.trajL   = 1.0;
+
+  TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+  TheHMC.Run();  // no smearing
+  // TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+
+} // main
+
+
+
+
+
+
+
diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc
index b105efb3..7c2c42df 100644
--- a/tests/hmc/Test_hmc_ScalarActionNxN.cc
+++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc
@@ -31,7 +31,10 @@ class ScalarActionParameters : Serializable {
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters,
     double, mass_squared,
-    double, lambda);
+    double, lambda,
+    double, g);
+
+  ScalarActionParameters() = default;
 
     template <class ReaderClass >
   ScalarActionParameters(Reader<ReaderClass>& Reader){
@@ -124,9 +127,12 @@ int main(int argc, char **argv) {
   TheHMC.Resources.AddGrid("scalar", ScalarGrid);
   std::cout << "Lattice size : " << GridDefaultLatt() << std::endl;
 
+  ScalarActionParameters SPar(Reader);
+
   // Checkpointer definition
   CheckpointerParameters CPparams(Reader);
-  TheHMC.Resources.LoadBinaryCheckpointer(CPparams);
+  //TheHMC.Resources.LoadBinaryCheckpointer(CPparams);
+  TheHMC.Resources.LoadScidacCheckpointer(CPparams, SPar);
 
   RNGModuleParameters RNGpar(Reader);
   TheHMC.Resources.SetRNGSeeds(RNGpar);
@@ -139,8 +145,7 @@ int main(int argc, char **argv) {
   // Collect actions, here use more encapsulation
 
   // Scalar action in adjoint representation
-  ScalarActionParameters SPar(Reader);
-  ScalarAction Saction(SPar.mass_squared, SPar.lambda);
+  ScalarAction Saction(SPar.mass_squared, SPar.lambda, SPar.g);
 
   // Collect actions
   ActionLevel<ScalarAction::Field, ScalarNxNMatrixFields<Ncolours>> Level1(1);
diff --git a/tests/hmc/Test_hmc_WC2ASFG_Production.cc b/tests/hmc/Test_hmc_WC2ASFG_Production.cc
new file mode 100644
index 00000000..d255ab5d
--- /dev/null
+++ b/tests/hmc/Test_hmc_WC2ASFG_Production.cc
@@ -0,0 +1,213 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/hmc/Test_hmc_WC2ASFG_Production.cc
+
+Copyright (C) 2017
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+
+namespace Grid{
+  struct FermionParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FermionParameters,
+            double, mass,
+            double, csw,
+				    double, StoppingCondition,
+				    int, MaxCGIterations,
+				    bool, ApplySmearing);
+  };
+
+  
+  struct WilsonCloverHMCParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonCloverHMCParameters,
+				  double, gauge_beta,
+				  FermionParameters, WilsonClover)
+
+  template <class ReaderClass >
+  WilsonCloverHMCParameters(Reader<ReaderClass>& Reader){
+    read(Reader, "Action", *this);
+    }
+  };
+
+  struct SmearingParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(SmearingParameters,
+				    double, rho,
+				    Integer, Nsmear)
+
+    template <class ReaderClass >
+    SmearingParameters(Reader<ReaderClass>& Reader){
+      read(Reader, "StoutSmearing", *this);
+    }
+
+  };
+  
+  
+}
+
+int main(int argc, char **argv)
+{
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  typedef Representations< FundamentalRepresentation, TwoIndexAntiSymmetricRepresentation > TheRepresentations;  
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  // Typedefs to simplify notation
+  typedef GenericHMCRunnerHirep<TheRepresentations, MinimumNorm2> HMCWrapper; // Uses the default minimum norm
+  typedef WilsonTwoIndexAntiSymmetricImplR FermionImplPolicy; // gauge field implementation for the pseudofermions
+  typedef WilsonCloverTwoIndexAntiSymmetricFermionR FermionAction; // type of lattice fermions (Wilson, DW, ...)
+  typedef typename FermionAction::FermionField FermionField;
+  typedef Grid::JSONReader Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+
+  // Grid from the command line
+  TheHMC.ReadCommandLine(argc, argv); 
+  if (TheHMC.ParameterFile.empty()){
+    std::cout << "Input file not specified."
+              << "Use --ParameterFile option in the command line.\nAborting" 
+              << std::endl;
+    exit(1);
+  }
+  Serialiser Reader(TheHMC.ParameterFile);
+  WilsonCloverHMCParameters MyParams(Reader);  
+
+  // Apply smearing to the fermionic action
+  bool ApplySmearing = MyParams.WilsonClover.ApplySmearing;
+
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams(Reader);
+  
+  /*
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix = "ckpoint_rng";
+  CPparams.saveInterval = 5;
+  CPparams.format = "IEEE64BIG";
+  */
+  
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar(Reader);
+  /*
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+  */
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+
+  typedef PolyakovMod<HMCWrapper::ImplPolicy> PolyakovObs;
+  TheHMC.Resources.AddObservable<PolyakovObs>();
+
+  //typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
+  //TopologyObsParameters TopParams(Reader);
+  //TheHMC.Resources.AddObservable<QObs>(TopParams);
+  //////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+  // need wrappers of the fermionic classes
+  // that have a complex construction
+  // standard
+  
+  //RealD beta = 5.6;
+  WilsonGaugeActionR Waction(MyParams.gauge_beta);
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  // temporarily need a gauge field
+  TwoIndexAntiSymmetricRepresentation::LatticeField U(GridPtr);
+
+  //Real mass = 0.01;
+  //Real csw = 1.0;
+
+  Real mass = MyParams.WilsonClover.mass;
+  Real csw = MyParams.WilsonClover.csw;
+
+  std::cout << "mass and csw" << mass << " and " << csw << std::endl; 
+
+  FermionAction FermOp(U, *GridPtr, *GridRBPtr, mass, csw, csw);
+  ConjugateGradient<FermionField> CG(MyParams.WilsonClover.StoppingCondition, MyParams.WilsonClover.MaxCGIterations);
+  TwoFlavourPseudoFermionAction<FermionImplPolicy> Nf2(FermOp, CG, CG);
+
+  // Set smearing (true/false), default: false
+  Nf2.is_smeared = ApplySmearing;
+
+  // Collect actions
+  ActionLevel<HMCWrapper::Field, TheRepresentations> Level1(1);
+  Level1.push_back(&Nf2);
+
+  ActionLevel<HMCWrapper::Field, TheRepresentations> Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  /////////////////////////////////////////////////////////////
+
+
+  /*
+    double rho = 0.1;  // smearing parameter
+    int Nsmear = 2;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(
+        UGrid, Nsmear, Stout);
+  */
+
+  // HMC parameters are serialisable
+
+  TheHMC.Parameters.initialize(Reader);
+  //TheHMC.Parameters.MD.MDsteps = 20;
+  //TheHMC.Parameters.MD.trajL = 1.0;
+
+  if (ApplySmearing){
+    SmearingParameters SmPar(Reader);
+    //double rho = 0.1;  // smearing parameter
+    //int Nsmear = 3;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(SmPar.rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, SmPar.Nsmear, Stout);
+    TheHMC.Run(SmearingPolicy); // for smearing
+  } else {
+    TheHMC.Run();  // no smearing
+  }
+
+  //TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+  //TheHMC.Run();                       // no smearing
+  // TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+
+} // main
+
diff --git a/tests/hmc/Test_hmc_WC2SFG_Production.cc b/tests/hmc/Test_hmc_WC2SFG_Production.cc
new file mode 100644
index 00000000..8d5fc458
--- /dev/null
+++ b/tests/hmc/Test_hmc_WC2SFG_Production.cc
@@ -0,0 +1,212 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/hmc/Test_hmc_WC2SFG_Production.cc
+
+Copyright (C) 2017
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+
+namespace Grid{
+  struct FermionParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FermionParameters,
+            double, mass,
+            double, csw,
+				    double, StoppingCondition,
+				    int, MaxCGIterations,
+				    bool, ApplySmearing);
+  };
+
+  
+  struct WilsonCloverHMCParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonCloverHMCParameters,
+				  double, gauge_beta,
+				  FermionParameters, WilsonClover)
+
+  template <class ReaderClass >
+  WilsonCloverHMCParameters(Reader<ReaderClass>& Reader){
+    read(Reader, "Action", *this);
+    }
+  };
+
+  struct SmearingParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(SmearingParameters,
+				    double, rho,
+				    Integer, Nsmear)
+
+    template <class ReaderClass >
+    SmearingParameters(Reader<ReaderClass>& Reader){
+      read(Reader, "StoutSmearing", *this);
+    }
+
+  };
+  
+  
+}
+
+int main(int argc, char **argv)
+{
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  typedef Representations< FundamentalRepresentation, TwoIndexSymmetricRepresentation > TheRepresentations;  
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  // Typedefs to simplify notation
+  typedef GenericHMCRunnerHirep<TheRepresentations, MinimumNorm2> HMCWrapper; // Uses the default minimum norm
+  typedef WilsonTwoIndexSymmetricImplR FermionImplPolicy; // gauge field implementation for the pseudofermions
+  typedef WilsonCloverTwoIndexSymmetricFermionR FermionAction; // type of lattice fermions (Wilson, DW, ...)
+  typedef typename FermionAction::FermionField FermionField;
+  typedef Grid::JSONReader Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+
+  // Grid from the command line
+  TheHMC.ReadCommandLine(argc, argv); 
+  if (TheHMC.ParameterFile.empty()){
+    std::cout << "Input file not specified."
+              << "Use --ParameterFile option in the command line.\nAborting" 
+              << std::endl;
+    exit(1);
+  }
+  Serialiser Reader(TheHMC.ParameterFile);
+  WilsonCloverHMCParameters MyParams(Reader);  
+
+  // Apply smearing to the fermionic action
+  bool ApplySmearing = MyParams.WilsonClover.ApplySmearing;
+
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams(Reader);
+  
+  /*
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix = "ckpoint_rng";
+  CPparams.saveInterval = 5;
+  CPparams.format = "IEEE64BIG";
+  */
+  
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar(Reader);
+  /*
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+  */
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+
+  typedef PolyakovMod<HMCWrapper::ImplPolicy> PolyakovObs;
+  TheHMC.Resources.AddObservable<PolyakovObs>();
+
+  //typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
+  //TopologyObsParameters TopParams(Reader);
+  //TheHMC.Resources.AddObservable<QObs>(TopParams);
+  //////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+  // need wrappers of the fermionic classes
+  // that have a complex construction
+  // standard
+  
+  //RealD beta = 5.6;
+  WilsonGaugeActionR Waction(MyParams.gauge_beta);
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  // temporarily need a gauge field
+  TwoIndexSymmetricRepresentation::LatticeField U(GridPtr);
+
+  //Real mass = 0.01;
+  //Real csw = 1.0;
+
+  Real mass = MyParams.WilsonClover.mass;
+  Real csw = MyParams.WilsonClover.csw;
+
+  std::cout << "mass and csw" << mass << " and " << csw << std::endl; 
+
+  FermionAction FermOp(U, *GridPtr, *GridRBPtr, mass, csw, csw);
+  ConjugateGradient<FermionField> CG(MyParams.WilsonClover.StoppingCondition, MyParams.WilsonClover.MaxCGIterations);
+  TwoFlavourPseudoFermionAction<FermionImplPolicy> Nf2(FermOp, CG, CG);
+
+  // Set smearing (true/false), default: false
+  Nf2.is_smeared = ApplySmearing;
+
+  // Collect actions
+  ActionLevel<HMCWrapper::Field, TheRepresentations> Level1(1);
+  Level1.push_back(&Nf2);
+
+  ActionLevel<HMCWrapper::Field, TheRepresentations> Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  /////////////////////////////////////////////////////////////
+
+
+  /*
+    double rho = 0.1;  // smearing parameter
+    int Nsmear = 2;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(
+        UGrid, Nsmear, Stout);
+  */
+
+  // HMC parameters are serialisable
+
+  TheHMC.Parameters.initialize(Reader);
+  //TheHMC.Parameters.MD.MDsteps = 20;
+  //TheHMC.Parameters.MD.trajL = 1.0;
+
+  if (ApplySmearing){
+    SmearingParameters SmPar(Reader);
+    //double rho = 0.1;  // smearing parameter
+    //int Nsmear = 3;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(SmPar.rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, SmPar.Nsmear, Stout);
+    TheHMC.Run(SmearingPolicy); // for smearing
+  } else {
+    TheHMC.Run();  // no smearing
+  }
+
+  //TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+  //TheHMC.Run();                       // no smearing
+  // TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+
+} // main
diff --git a/tests/hmc/Test_hmc_WCFG_Production.cc b/tests/hmc/Test_hmc_WCFG_Production.cc
new file mode 100644
index 00000000..895e4f81
--- /dev/null
+++ b/tests/hmc/Test_hmc_WCFG_Production.cc
@@ -0,0 +1,210 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/hmc/Test_hmc_WCFG_Production.cc
+
+Copyright (C) 2017
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+
+namespace Grid {
+  // Per-fermion inputs: mass, csw, CG stopping condition / iteration cap, smearing switch.
+  struct FermionParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FermionParameters,
+                                    double, mass,
+                                    double, csw,
+                                    double, StoppingCondition,
+                                    int,    MaxCGIterations,
+                                    bool,   ApplySmearing)
+  };
+
+  // Top-level inputs; populated from the "Action" node of the supplied reader.
+  struct WilsonCloverHMCParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonCloverHMCParameters,
+                                    double,            gauge_beta,
+                                    FermionParameters, WilsonClover)
+
+    template <class ReaderImpl>
+    WilsonCloverHMCParameters(Reader<ReaderImpl>& input) {
+      read(input, "Action", *this);
+    }
+  };
+
+  // Stout smearing inputs; populated from the "StoutSmearing" node of the supplied reader.
+  struct SmearingParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(SmearingParameters,
+                                    double,  rho,
+                                    Integer, Nsmear)
+
+    template <class ReaderImpl>
+    SmearingParameters(Reader<ReaderImpl>& input) {
+      read(input, "StoutSmearing", *this);
+    }
+  };
+
+}  // namespace Grid
+
+// HMC driver: Wilson gauge action plus a two-flavour Wilson clover action, read from a JSON file.
+int main(int argc, char **argv) {
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  // Typedefs to simplify notation
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; // Uses the default minimum norm
+  typedef WilsonImplR FermionImplPolicy;
+  typedef WilsonCloverFermionR FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+  typedef Grid::JSONReader Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+
+  // Grid from the command line
+  TheHMC.ReadCommandLine(argc, argv);
+  if (TheHMC.ParameterFile.empty()){
+    // The two literals are streamed back to back, so the separating space must be explicit.
+    std::cout << "Input file not specified. "
+              << "Use --ParameterFile option in the command line.\nAborting" << std::endl;
+    exit(1);
+  }
+  Serialiser Reader(TheHMC.ParameterFile);
+  WilsonCloverHMCParameters MyParams(Reader);
+
+  // Apply smearing to the fermionic action
+  bool ApplySmearing = MyParams.WilsonClover.ApplySmearing;
+
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams(Reader);
+
+  /*
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix = "ckpoint_rng";
+  CPparams.saveInterval = 5;
+  CPparams.format = "IEEE64BIG";
+  */
+
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar(Reader);
+  /*
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+  */
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+
+  typedef PolyakovMod<HMCWrapper::ImplPolicy> PolyakovObs;
+  TheHMC.Resources.AddObservable<PolyakovObs>();
+
+  //typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
+  //TopologyObsParameters TopParams(Reader);
+  //TheHMC.Resources.AddObservable<QObs>(TopParams);
+  //////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+  // need wrappers of the fermionic classes
+  // that have a complex construction
+  // standard
+
+  //RealD beta = 5.6;
+  WilsonGaugeActionR Waction(MyParams.gauge_beta);
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+
+  //Real mass = 0.01;
+  //Real csw = 1.0;
+
+  Real mass = MyParams.WilsonClover.mass;
+  Real csw = MyParams.WilsonClover.csw;
+
+  std::cout << "mass and csw: " << mass << " and " << csw << std::endl;
+  // The same csw value is supplied for both trailing coefficient arguments of the operator.
+  FermionAction FermOp(U, *GridPtr, *GridRBPtr, mass, csw, csw);
+  ConjugateGradient<FermionField> CG(MyParams.WilsonClover.StoppingCondition, MyParams.WilsonClover.MaxCGIterations);
+  TwoFlavourPseudoFermionAction<FermionImplPolicy> Nf2(FermOp, CG, CG);
+
+  // Set smearing (true/false), default: false
+  Nf2.is_smeared = ApplySmearing;
+
+  // Collect actions
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  Level1.push_back(&Nf2);
+
+  ActionLevel<HMCWrapper::Field> Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  /////////////////////////////////////////////////////////////
+
+
+  /*
+    double rho = 0.1;  // smearing parameter
+    int Nsmear = 2;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(
+        UGrid, Nsmear, Stout);
+  */
+
+  // HMC parameters are serialisable
+
+  TheHMC.Parameters.initialize(Reader);
+  //TheHMC.Parameters.MD.MDsteps = 20;
+  //TheHMC.Parameters.MD.trajL = 1.0;
+
+  if (ApplySmearing){
+    SmearingParameters SmPar(Reader);
+    //double rho = 0.1;  // smearing parameter
+    //int Nsmear = 3;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(SmPar.rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, SmPar.Nsmear, Stout);
+    TheHMC.Run(SmearingPolicy); // for smearing
+  } else {
+    TheHMC.Run();  // no smearing
+  }
+
+  //TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+  //TheHMC.Run();                       // no smearing
+  // TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+
+} // main
diff --git a/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc b/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc
new file mode 100644
index 00000000..aa5cce85
--- /dev/null
+++ b/tests/hmc/Test_hmc_WCMixedRepFG_Production.cc
@@ -0,0 +1,224 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/hmc/Test_hmc_WCMixedRepFG_Production.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include "Grid/Grid.h"
+
+
+namespace Grid {
+  // Per-fermion inputs: mass, csw, CG stopping condition / iteration cap, smearing switch.
+  struct FermionParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FermionParameters,
+                                    double, mass,
+                                    double, csw,
+                                    double, StoppingCondition,
+                                    int,    MaxCGIterations,
+                                    bool,   ApplySmearing)
+  };
+
+  // Top-level inputs (one fermion set per representation); read from the "Action" node.
+  struct WilsonCloverHMCParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonCloverHMCParameters,
+                                    double,            gauge_beta,
+                                    FermionParameters, WilsonCloverFund,
+                                    FermionParameters, WilsonCloverAS)
+    template <class ReaderImpl>
+    WilsonCloverHMCParameters(Reader<ReaderImpl>& input) {
+      read(input, "Action", *this);
+    }
+  };
+
+  // Stout smearing inputs; read from the "StoutSmearing" node of the supplied reader.
+  struct SmearingParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(SmearingParameters,
+                                    double,  rho,
+                                    Integer, Nsmear)
+    template <class ReaderImpl>
+    SmearingParameters(Reader<ReaderImpl>& input) {
+      read(input, "StoutSmearing", *this);
+    }
+  };
+}  // namespace Grid
+
+
+// HMC driver: Wilson gauge action plus clover fermions in two representations, read from JSON.
+int main(int argc, char **argv) {
+  using namespace Grid;
+  using namespace Grid::QCD;
+  // Here change the allowed (higher) representations
+  typedef Representations< FundamentalRepresentation, TwoIndexAntiSymmetricRepresentation> TheRepresentations;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  // Typedefs to simplify notation
+  typedef GenericHMCRunnerHirep<TheRepresentations, MinimumNorm2> HMCWrapper;
+
+  typedef WilsonImplR FundImplPolicy;
+  typedef WilsonCloverFermionR FundFermionAction;
+  typedef typename FundFermionAction::FermionField FundFermionField;
+
+  typedef WilsonTwoIndexAntiSymmetricImplR ASymmImplPolicy;
+  typedef WilsonCloverTwoIndexAntiSymmetricFermionR ASymmFermionAction;
+  typedef typename ASymmFermionAction::FermionField ASymmFermionField;
+
+  typedef Grid::JSONReader Serialiser;
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+
+  // Grid from the command line
+  TheHMC.ReadCommandLine(argc, argv);
+  if (TheHMC.ParameterFile.empty()){
+    // The two literals are streamed back to back, so the separating space must be explicit.
+    std::cout << "Input file not specified. "
+              << "Use --ParameterFile option in the command line.\nAborting" << std::endl;
+    exit(1);
+  }
+  Serialiser Reader(TheHMC.ParameterFile);
+  WilsonCloverHMCParameters MyParams(Reader);
+
+  // Apply smearing to the fermionic action
+  bool ApplySmearingFund = MyParams.WilsonCloverFund.ApplySmearing;
+  bool ApplySmearingAS = MyParams.WilsonCloverAS.ApplySmearing;
+
+
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams(Reader);
+
+  /*
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix = "ckpoint_rng";
+  CPparams.saveInterval = 5;
+  CPparams.format = "IEEE64BIG";
+  */
+
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar(Reader);
+  /*
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+  */
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+
+  typedef PolyakovMod<HMCWrapper::ImplPolicy> PolyakovObs;
+  TheHMC.Resources.AddObservable<PolyakovObs>();
+
+  typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
+  TopologyObsParameters TopParams(Reader);
+  TheHMC.Resources.AddObservable<QObs>(TopParams);
+  //////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+  // need wrappers of the fermionic classes
+  // that have a complex construction
+  // standard
+
+  //RealD beta = 5.6;
+  WilsonGaugeActionR Waction(MyParams.gauge_beta);
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  // temporarily need a gauge field
+  FundamentalRepresentation::LatticeField UF(GridPtr);
+  TwoIndexAntiSymmetricRepresentation::LatticeField UAS(GridPtr);
+
+
+  Real Fundmass = MyParams.WilsonCloverFund.mass;
+  Real Fundcsw = MyParams.WilsonCloverFund.csw;
+  Real ASmass = MyParams.WilsonCloverAS.mass;
+  Real AScsw = MyParams.WilsonCloverAS.csw;
+
+
+
+  std::cout << "Fund: mass and csw: " << Fundmass << " and " << Fundcsw << std::endl;
+  std::cout << "AS  : mass and csw: " << ASmass << " and " << AScsw << std::endl;
+
+  // Each operator receives its csw value for both trailing coefficient arguments.
+  FundFermionAction FundFermOp(UF, *GridPtr, *GridRBPtr, Fundmass, Fundcsw, Fundcsw);
+  ConjugateGradient<FundFermionField> CG_Fund(MyParams.WilsonCloverFund.StoppingCondition, MyParams.WilsonCloverFund.MaxCGIterations);
+  TwoFlavourPseudoFermionAction<FundImplPolicy> Nf2_Fund(FundFermOp, CG_Fund, CG_Fund);
+
+  ASymmFermionAction ASFermOp(UAS, *GridPtr, *GridRBPtr, ASmass, AScsw, AScsw);
+  ConjugateGradient<ASymmFermionField> CG_AS(MyParams.WilsonCloverAS.StoppingCondition, MyParams.WilsonCloverAS.MaxCGIterations);
+  TwoFlavourPseudoFermionAction<ASymmImplPolicy> Nf2_AS(ASFermOp, CG_AS, CG_AS);
+
+  Nf2_Fund.is_smeared = ApplySmearingFund;
+  Nf2_AS.is_smeared   = ApplySmearingAS;
+
+
+  // Collect actions
+  ActionLevel<HMCWrapper::Field, TheRepresentations > Level1(1);
+  Level1.push_back(&Nf2_Fund);
+  Level1.push_back(&Nf2_AS);
+
+
+  ActionLevel<HMCWrapper::Field, TheRepresentations > Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+
+  TheHMC.Parameters.initialize(Reader);
+  // NOTE(review): is_smeared is taken from the input above, but the smeared run below is
+  // commented out, so Run() is always invoked without a smearing policy -- confirm intent.
+/*
+  if (ApplySmearingFund || ApplySmearingAS){
+    SmearingParameters SmPar(Reader);
+    //double rho = 0.1;  // smearing parameter
+    //int Nsmear = 3;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(SmPar.rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, SmPar.Nsmear, Stout);
+    TheHMC.Run(SmearingPolicy); // for smearing
+  } else {
+    TheHMC.Run();  // no smearing
+  }
+*/
+  TheHMC.Run();
+
+
+  //TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+  //TheHMC.Run();                       // no smearing
+  // TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+
+} // main
diff --git a/tests/hmc/Test_hmc_WCadjFG_Production.cc b/tests/hmc/Test_hmc_WCadjFG_Production.cc
new file mode 100644
index 00000000..48cea756
--- /dev/null
+++ b/tests/hmc/Test_hmc_WCadjFG_Production.cc
@@ -0,0 +1,213 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/hmc/Test_hmc_WCadjFG_Production.cc
+
+Copyright (C) 2017
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+
+namespace Grid {
+  // Per-fermion inputs: mass, csw, CG stopping condition / iteration cap, smearing switch.
+  struct FermionParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(FermionParameters,
+                                    double, mass,
+                                    double, csw,
+                                    double, StoppingCondition,
+                                    int,    MaxCGIterations,
+                                    bool,   ApplySmearing)
+  };
+
+  // Top-level inputs; populated from the "Action" node of the supplied reader.
+  struct WilsonCloverHMCParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonCloverHMCParameters,
+                                    double,            gauge_beta,
+                                    FermionParameters, WilsonClover)
+
+    template <class ReaderImpl>
+    WilsonCloverHMCParameters(Reader<ReaderImpl>& input) {
+      read(input, "Action", *this);
+    }
+  };
+
+  // Stout smearing inputs; populated from the "StoutSmearing" node of the supplied reader.
+  struct SmearingParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(SmearingParameters,
+                                    double,  rho,
+                                    Integer, Nsmear)
+
+    template <class ReaderImpl>
+    SmearingParameters(Reader<ReaderImpl>& input) {
+      read(input, "StoutSmearing", *this);
+    }
+  };
+
+}  // namespace Grid
+
+// HMC driver: Wilson gauge action plus two-flavour adjoint Wilson clover fermions, read from JSON.
+int main(int argc, char **argv) {
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  typedef Representations< FundamentalRepresentation, AdjointRepresentation > TheRepresentations;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  // Typedefs to simplify notation
+  typedef GenericHMCRunnerHirep<TheRepresentations, MinimumNorm2> HMCWrapper; // Uses the default minimum norm
+  typedef WilsonAdjImplR FermionImplPolicy; // gauge field implementation for the pseudofermions
+  typedef WilsonCloverAdjFermionR FermionAction; // type of lattice fermions (Wilson, DW, ...)
+  typedef typename FermionAction::FermionField FermionField;
+  typedef Grid::JSONReader Serialiser;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+
+  // Grid from the command line
+  TheHMC.ReadCommandLine(argc, argv);
+  if (TheHMC.ParameterFile.empty()){
+    // The two literals are streamed back to back, so the separating space must be explicit.
+    std::cout << "Input file not specified. "
+              << "Use --ParameterFile option in the command line.\nAborting" << std::endl;
+    exit(1);
+  }
+  Serialiser Reader(TheHMC.ParameterFile);
+  WilsonCloverHMCParameters MyParams(Reader);
+
+  // Apply smearing to the fermionic action
+  bool ApplySmearing = MyParams.WilsonClover.ApplySmearing;
+
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams(Reader);
+
+  /*
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix = "ckpoint_rng";
+  CPparams.saveInterval = 5;
+  CPparams.format = "IEEE64BIG";
+  */
+
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar(Reader);
+  /*
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+  */
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+
+  typedef PolyakovMod<HMCWrapper::ImplPolicy> PolyakovObs;
+  TheHMC.Resources.AddObservable<PolyakovObs>();
+
+  typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
+  TopologyObsParameters TopParams(Reader);
+  TheHMC.Resources.AddObservable<QObs>(TopParams);
+  //////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+  // need wrappers of the fermionic classes
+  // that have a complex construction
+  // standard
+
+  //RealD beta = 5.6;
+  WilsonGaugeActionR Waction(MyParams.gauge_beta);
+
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  // temporarily need a gauge field
+  AdjointRepresentation::LatticeField U(GridPtr);
+
+  //Real mass = 0.01;
+  //Real csw = 1.0;
+
+  Real mass = MyParams.WilsonClover.mass;
+  Real csw = MyParams.WilsonClover.csw;
+
+  std::cout << "mass and csw: " << mass << " and " << csw << std::endl;
+  // The same csw value is supplied for both trailing coefficient arguments of the operator.
+  FermionAction FermOp(U, *GridPtr, *GridRBPtr, mass, csw, csw);
+  ConjugateGradient<FermionField> CG(MyParams.WilsonClover.StoppingCondition, MyParams.WilsonClover.MaxCGIterations);
+  TwoFlavourPseudoFermionAction<FermionImplPolicy> Nf2(FermOp, CG, CG);
+
+  // Set smearing (true/false), default: false
+  Nf2.is_smeared = ApplySmearing;
+
+  // Collect actions
+  ActionLevel<HMCWrapper::Field, TheRepresentations> Level1(1);
+  Level1.push_back(&Nf2);
+
+  ActionLevel<HMCWrapper::Field, TheRepresentations> Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  /////////////////////////////////////////////////////////////
+
+
+  /*
+    double rho = 0.1;  // smearing parameter
+    int Nsmear = 2;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(
+        UGrid, Nsmear, Stout);
+  */
+
+  // HMC parameters are serialisable
+
+  TheHMC.Parameters.initialize(Reader);
+  //TheHMC.Parameters.MD.MDsteps = 20;
+  //TheHMC.Parameters.MD.trajL = 1.0;
+
+  if (ApplySmearing){
+    SmearingParameters SmPar(Reader);
+    //double rho = 0.1;  // smearing parameter
+    //int Nsmear = 3;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(SmPar.rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, SmPar.Nsmear, Stout);
+    TheHMC.Run(SmearingPolicy); // for smearing
+  } else {
+    TheHMC.Run();  // no smearing
+  }
+
+  //TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+  //TheHMC.Run();                       // no smearing
+  // TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+
+} // main
+
diff --git a/tests/hmc/Test_hmc_WG_Production.cc b/tests/hmc/Test_hmc_WG_Production.cc
new file mode 100644
index 00000000..7f8d8124
--- /dev/null
+++ b/tests/hmc/Test_hmc_WG_Production.cc
@@ -0,0 +1,117 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/hmc/Test_hmc_WG_Production.cc
+
+Copyright (C) 2015
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+namespace Grid {
+  // Gauge-action input (beta); can be populated from the "Action" node of a reader.
+  struct ActionParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ActionParameters,
+                                    double, beta)
+
+    // Explicitly defaulted so the type stays default-constructible next to the reader constructor.
+    ActionParameters() = default;
+
+    template <class ReaderImpl>
+    ActionParameters(Reader<ReaderImpl>& input) {
+      read(input, "Action", *this);
+    }
+  };
+}  // namespace Grid
+
+
+// HMC driver: pure Wilson gauge action with a Scidac checkpointer, configured from a JSON file.
+int main(int argc, char **argv) {
+  using namespace Grid;
+  using namespace Grid::QCD;
+  Grid_init(&argc, &argv);
+  GridLogLayout();
+
+  // Typedefs to simplify notation
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;  // Uses the default minimum norm
+  HMCWrapper TheHMC;
+  typedef Grid::JSONReader       Serialiser;
+
+  // Grid from the command line
+  TheHMC.Resources.AddFourDimGrid("gauge");
+  TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+  // Reader, file should come from command line
+  if (TheHMC.ParameterFile.empty()){
+    // The two literals are streamed back to back, so the separating space must be explicit.
+    std::cout << "Input file not specified. "
+              << "Use --ParameterFile option in the command line.\nAborting" << std::endl;
+    exit(1);
+  }
+  Serialiser Reader(TheHMC.ParameterFile);
+
+  // Read parameters from input file
+  ActionParameters WilsonPar(Reader);
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams(Reader);
+  //TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  // Store metadata in the Scidac checkpointer
+  TheHMC.Resources.LoadScidacCheckpointer(CPparams, WilsonPar);
+
+  RNGModuleParameters RNGpar(Reader);
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  // here there is too much indirection
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+  TopologyObsParameters TopParams(Reader);
+  TheHMC.Resources.AddObservable<QObs>(TopParams);
+  //////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+  // need wrappers of the fermionic classes
+  // that have a complex construction
+  // standard
+  WilsonGaugeActionR Waction(WilsonPar.beta);
+
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  Level1.push_back(&Waction);
+  //Level1.push_back(WGMod.getPtr());
+  TheHMC.TheAction.push_back(Level1);
+  /////////////////////////////////////////////////////////////
+
+  // HMC parameters are serialisable
+  TheHMC.Parameters.initialize(Reader);
+
+  //TheHMC.Parameters.MD.MDsteps = 17;
+  //TheHMC.Parameters.MD.trajL   = 1.0;
+
+  TheHMC.Run();  // no smearing
+
+  Grid_finalize();
+
+} // main
diff --git a/tests/hmc/Test_hmc_WilsonCloverFermionGauge.cc b/tests/hmc/Test_hmc_WilsonCloverFermionGauge.cc
new file mode 100644
index 00000000..8d5701b8
--- /dev/null
+++ b/tests/hmc/Test_hmc_WilsonCloverFermionGauge.cc
@@ -0,0 +1,129 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/hmc/Test_hmc_WilsonCloverFermionGauge.cc
+
+Copyright (C) 2017
+
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+// HMC test: Wilson gauge action plus two-flavour Wilson clover fermions with hard-coded inputs.
+int main(int argc, char **argv) {
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  Grid_init(&argc, &argv);
+  int threads = GridThread::GetThreads();
+  // here make a routine to print all the relevant information on the run
+  std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  // Typedefs to simplify notation
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; // Uses the default minimum norm
+  typedef WilsonImplR FermionImplPolicy;
+  typedef WilsonCloverFermionR FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+
+  //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+  HMCWrapper TheHMC;
+
+  // Grid from the command line
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix = "ckpoint_rng";
+  CPparams.saveInterval = 5;
+  CPparams.format = "IEEE64BIG";
+
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+  // Construct observables
+  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
+  TheHMC.Resources.AddObservable<PlaqObs>();
+
+  typedef PolyakovMod<HMCWrapper::ImplPolicy> PolyakovObs;
+  TheHMC.Resources.AddObservable<PolyakovObs>();
+  //////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////
+  // Collect actions, here use more encapsulation
+  // need wrappers of the fermionic classes
+  // that have a complex construction
+  // standard
+  RealD beta = 5.6;
+  WilsonGaugeActionR Waction(beta);
+
+  auto GridPtr = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+
+  // temporarily need a gauge field
+  LatticeGaugeField U(GridPtr);
+
+  Real mass = 0.01;
+  Real csw = 1.0;
+  // Supply csw for both trailing coefficient arguments, as the *_Production clover tests do.
+  FermionAction FermOp(U, *GridPtr, *GridRBPtr, mass, csw, csw);
+  ConjugateGradient<FermionField> CG(1.0e-8, 5000);
+
+  TwoFlavourPseudoFermionAction<FermionImplPolicy> Nf2(FermOp, CG, CG);
+
+  // Set smearing (true/false), default: false
+  Nf2.is_smeared = false;
+
+  // Collect actions
+  ActionLevel<HMCWrapper::Field> Level1(1);
+  Level1.push_back(&Nf2);
+
+  ActionLevel<HMCWrapper::Field> Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+  /////////////////////////////////////////////////////////////
+
+  /*
+    double rho = 0.1;  // smearing parameter
+    int Nsmear = 2;    // number of smearing levels
+    Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
+    SmearedConfiguration<HMCWrapper::ImplPolicy> SmearingPolicy(
+        UGrid, Nsmear, Stout);
+  */
+
+  // HMC parameters are serialisable
+  TheHMC.Parameters.MD.MDsteps = 20;
+  TheHMC.Parameters.MD.trajL = 1.0;
+
+  TheHMC.ReadCommandLine(argc, argv); // these can be parameters from file
+  TheHMC.Run();                       // no smearing
+  // TheHMC.Run(SmearingPolicy); // for smearing
+
+  Grid_finalize();
+
+} // main
diff --git a/tests/lanczos/Test_WCMultiRep_lanczos.cc b/tests/lanczos/Test_WCMultiRep_lanczos.cc
new file mode 100644
index 00000000..b6d69aee
--- /dev/null
+++ b/tests/lanczos/Test_WCMultiRep_lanczos.cc
@@ -0,0 +1,178 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/lanczos/Test_WCMultiRep_lanczos.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+//typedef WilsonCloverFermionR FermionOp;
+//typedef typename WilsonFermionR::FermionField FermionField;
+
+typedef WilsonImplR FundImplPolicy;
+typedef WilsonCloverFermionR FundFermionAction; 
+typedef typename FundFermionAction::FermionField FundFermionField;
+
+typedef WilsonTwoIndexAntiSymmetricImplR ASymmImplPolicy; 
+typedef WilsonCloverTwoIndexAntiSymmetricFermionR ASymmFermionAction; 
+typedef typename ASymmFermionAction::FermionField ASymmFermionField;
+
+
+RealD AllZero(RealD x) { return 0.; } // ignores x and always returns zero; not referenced elsewhere in this file
+
+int main(int argc, char** argv) {
+  Grid_init(&argc, &argv);
+  // Purely 4d set-up: the fermion grids are aliases of the gauge grids.
+  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
+      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
+      GridDefaultMpi());
+  GridRedBlackCartesian* UrbGrid =
+      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian* FGrid = UGrid;
+  GridRedBlackCartesian* FrbGrid = UrbGrid;
+  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", (void*)UGrid,
+         (void*)UrbGrid, (void*)FGrid, (void*)FrbGrid);
+  // Fixed seeds keep the random Lanczos start vectors reproducible.
+  std::vector<int> seeds4({1, 2, 3, 4});
+  std::vector<int> seeds5({5, 6, 7, 8});
+  GridParallelRNG RNG5(FGrid);
+  RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG RNG4(UGrid);
+  RNG4.SeedFixedIntegers(seeds4);
+  // RNG5 feeds the Gaussian sources; RNG4 is only needed by the disabled
+  // HotConfiguration call inside the configuration loop.
+
+  GridParallelRNG          pRNG(UGrid);
+  GridSerialRNG            sRNG;
+
+  FundamentalRepresentation::LatticeField Umu(UGrid);
+
+  TwoIndexAntiSymmetricRepresentation HiRep(UGrid);
+  TwoIndexAntiSymmetricRepresentation::LatticeField UmuAS(UGrid);
+
+  // Checkpointer used below to restore each gauge configuration and the RNG state.
+  CheckpointerParameters CPparams;
+
+  CPparams.config_prefix = "ckpoint_lat";
+  CPparams.rng_prefix = "ckpoint_rng";
+  CPparams.format = "IEEE64BIG";
+
+  //NerscHmcCheckpointer<PeriodicGimplR> Checkpoint(std::string("ckpoint_lat"),
+  //                                                 std::string("ckpoint_rng"), 1);
+
+  NerscHmcCheckpointer<PeriodicGimplR> Checkpoint(CPparams);
+
+  int CNFGSTART=1;
+  int CNFGEND=2;
+  int CNFGSTEP=1;
+
+  Real Fundmass = -0.1;
+  Real Fundcsw  =  1.0;
+  Real ASmass   = -0.1;
+  Real AScsw    =  1.0;
+
+  std::cout << "Fund: mass and csw" << Fundmass << " and " << Fundcsw << std::endl;
+  std::cout << "AS  : mass and csw" << ASmass << " and " << AScsw << std::endl;
+  // Lanczos parameters handed to ImplicitlyRestartedLanczos below.
+  const int Nstop = 30;
+  const int Nk = 40;
+  const int Np = 40;
+  const int Nm = Nk + Np;
+  const int MaxIt = 10000;
+  RealD resid = 1.0e-8;
+
+  for (int cnfg=CNFGSTART;cnfg<=CNFGEND;cnfg+=CNFGSTEP){
+    Checkpoint.CheckpointRestore(cnfg,Umu, sRNG, pRNG);
+
+    //SU4::HotConfiguration(RNG4, Umu); // temporary, then read.
+
+    HiRep.update_representation(Umu);
+    UmuAS = HiRep.U;
+    // Hermitian MdagM operators for the fundamental and antisymmetric fermions.
+    FundFermionAction FundFermOp(Umu,*FGrid,*FrbGrid, Fundmass, Fundcsw, Fundcsw);
+    MdagMLinearOperator<FundFermionAction,FundFermionField> HermOpFund(FundFermOp); /// <-----
+
+    ASymmFermionAction ASFermOp(UmuAS,*FGrid,*FrbGrid, ASmass, AScsw, AScsw);
+    MdagMLinearOperator<ASymmFermionAction,ASymmFermionField> HermOpAS(ASFermOp); /// <-----
+    // Polynomial built from Coeffs; wrapped below as the second Lanczos operator.
+    std::vector<double> Coeffs{0, -1.};
+    Polynomial<FundFermionField> FundPolyX(Coeffs);
+    //Chebyshev<FundFermionField> FundCheb(0.0, 10., 12);
+
+    FunctionHermOp<FundFermionField> FundPolyXOp(FundPolyX,HermOpFund);
+    PlainHermOp<FundFermionField>    FundOp     (HermOpFund);
+
+    ImplicitlyRestartedLanczos<FundFermionField> IRL_Fund(FundOp, FundPolyXOp, Nstop, Nk, Nm,
+                                                 resid, MaxIt);
+
+    Polynomial<ASymmFermionField> ASPolyX(Coeffs);
+    //Chebyshev<ASymmFermionField> ASCheb(0.0, 10., 12);
+
+    FunctionHermOp<ASymmFermionField> ASPolyXOp(ASPolyX,HermOpAS);
+    PlainHermOp<ASymmFermionField>    ASOp     (HermOpAS);
+
+    ImplicitlyRestartedLanczos<ASymmFermionField> IRL_AS(ASOp, ASPolyXOp, Nstop, Nk, Nm,
+                                                 resid, MaxIt);
+
+    std::vector<RealD> Fundeval(Nm);
+    std::vector<RealD> ASeval(Nm);
+
+    FundFermionField Fundsrc(FGrid);
+    ASymmFermionField   ASsrc(FGrid);
+
+    gaussian(RNG5, Fundsrc);
+    gaussian(RNG5, ASsrc);
+
+    std::vector<FundFermionField> Fundevec(Nm, FGrid);
+    std::vector<ASymmFermionField>   ASevec(Nm, FGrid);
+
+    for (int i = 0; i < 1; i++) {
+      std::cout << i << " / " << Nm << "Fund: grid pointer " << Fundevec[i]._grid
+                << std::endl;
+    };
+    for (int i = 0; i < 1; i++) {
+      std::cout << i << " / " << Nm << "AS: grid pointer " << ASevec[i]._grid
+                << std::endl;
+    };
+
+    int FundNconv, ASNconv;
+    IRL_Fund.calc(Fundeval, Fundevec, Fundsrc, FundNconv);
+    IRL_AS.calc(ASeval, ASevec, ASsrc, ASNconv);
+
+    for (int i=0;i<FundNconv;i++){
+      std::cout << "Fund: eval[" << i << "] = " << Fundeval[i] << std::endl;
+    }
+    for (int i=0;i<ASNconv;i++){
+      std::cout << "2Index: eval[" << i << "] = " << ASeval[i] << std::endl;
+    }
+  }
+
+  Grid_finalize();
+}
diff --git a/tests/lanczos/Test_compressed_lanczos.cc b/tests/lanczos/Test_compressed_lanczos.cc
new file mode 100644
index 00000000..8bce82bb
--- /dev/null
+++ b/tests/lanczos/Test_compressed_lanczos.cc
@@ -0,0 +1,253 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/lanczos/Test_compressed_lanczos.cc
+
+    Copyright (C) 2017
+
+Author: Leans heavily on Christoph Lehner's code
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+/*
+ *  Reimplement the badly named "multigrid" Lanczos as compressed Lanczos using the features
+ *  in Grid that were intended to be used to support blocked Aggregates.
+ */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class Fobj,class CComplex,int nbasis>
+class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis>
+{ // Test-local extension of LocalCoherenceLanczos that adds SciDAC/XML checkpointing of the fine and coarse eigen-data.
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid,
+			      LinearOperatorBase<FineField> &FineOp,
+			      int checkerboard) 
+    // Base constructor
+    : LocalCoherenceLanczos<Fobj,CComplex,nbasis>(FineGrid,CoarseGrid,FineOp,checkerboard) 
+  {};
+  // Write the nbasis vectors in subspace as SciDAC records to evecs_file and evals_fine as XML to evals_file.
+  void checkpointFine(std::string evecs_file,std::string evals_file)
+  {
+    assert(this->subspace.size()==nbasis);
+    emptyUserRecord record;
+    Grid::QCD::ScidacWriter WR(this->_FineGrid->IsBoss());
+    WR.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      WR.writeScidacFieldRecord(this->subspace[k],record);
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_fine);
+  }
+  // Read evals_fine from evals_file (XML) and nbasis vectors from evecs_file, setting each vector's checkerboard first.
+  void checkpointFineRestore(std::string evecs_file,std::string evals_file)
+  {
+    this->evals_fine.resize(nbasis);
+    this->subspace.resize(nbasis,this->_FineGrid);
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_fine);
+    
+    assert(this->evals_fine.size()==nbasis);
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::QCD::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      this->subspace[k].checkerboard=this->_checkerboard;
+      RD.readScidacFieldRecord(this->subspace[k],record);
+      
+    }
+    RD.close();
+  }
+  // Write every vector in evec_coarse as a SciDAC record to evecs_file and evals_coarse as XML to evals_file.
+  void checkpointCoarse(std::string evecs_file,std::string evals_file)
+  {
+    int n = this->evec_coarse.size();
+    emptyUserRecord record;
+    Grid::QCD::ScidacWriter WR(this->_CoarseGrid->IsBoss());
+    WR.open(evecs_file);
+    for(int k=0;k<n;k++) {
+      WR.writeScidacFieldRecord(this->evec_coarse[k],record);
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_coarse);
+  }
+  // Read evals_coarse from evals_file and nvec coarse vectors from evecs_file; asserts that exactly nvec evals were read.
+  void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec)
+  {
+    std::cout << "resizing coarse vecs to " << nvec<< std::endl;
+    this->evals_coarse.resize(nvec);
+    this->evec_coarse.resize(nvec,this->_CoarseGrid);
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_coarse);
+
+    assert(this->evals_coarse.size()==nvec);
+    emptyUserRecord record;
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    Grid::QCD::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nvec;k++) {
+      RD.readScidacFieldRecord(this->evec_coarse[k],record);
+    }
+    RD.close();
+  }
+};
+
+int main (int argc, char ** argv) {
+  // Driver: fine-grid Lanczos (or reload), then optional coarse-grid Lanczos / reload, all steered by ./Params.xml.
+  Grid_init(&argc,&argv);
+  GridLogIRL.TimingMode(1);
+  // First dump a template parameter file, then load the real parameters from ./Params.xml.
+  LocalCoherenceLanczosParams Params;
+  {
+    Params.omega.resize(10);
+    Params.blockSize.resize(5);
+    XmlWriter writer("Params_template.xml");
+    write(writer,"Params",Params);
+    std::cout << GridLogMessage << " Written Params_template.xml" <<std::endl;
+  }
+  
+  { 
+    XmlReader reader(std::string("./Params.xml"));
+    read(reader, "Params", Params);
+  }
+
+  int     Ls = (int)Params.omega.size();
+  RealD mass = Params.mass;
+  RealD M5   = Params.M5;
+  std::vector<int> blockSize = Params.blockSize;
+
+  // Grids
+  GridCartesian         * UGrid     = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								     GridDefaultSimd(Nd,vComplex::Nsimd()),
+								     GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid   = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  // blockSize needs one entry per lattice dimension plus one for Ls; every entry must divide its extent exactly.
+  std::vector<int> fineLatt     = GridDefaultLatt();
+  int dims=fineLatt.size();
+  assert(blockSize.size()==dims+1);
+  std::vector<int> coarseLatt(dims);
+  std::vector<int> coarseLatt5d ;
+
+  for (int d=0;d<coarseLatt.size();d++){
+    coarseLatt[d] = fineLatt[d]/blockSize[d];    assert(coarseLatt[d]*blockSize[d]==fineLatt[d]);
+  }
+
+  std::cout << GridLogMessage<< " 5d coarse lattice is ";
+  for (int i=0;i<coarseLatt.size();i++){
+    std::cout << coarseLatt[i]<<"x";
+  } 
+  int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls);
+  std::cout << cLs<<std::endl;
+  
+  GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * CoarseGrid4rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
+  GridCartesian         * CoarseGrid5    = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
+
+  // Gauge field
+  LatticeGaugeField Umu(UGrid);
+  FieldMetaData header;
+  NerscIO::readConfiguration(Umu,header,Params.config);
+  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << "   Ls: " << Ls << std::endl;
+
+  // ZMobius EO Operator
+  ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.);
+  SchurDiagTwoOperator<ZMobiusFermionR,LatticeFermion> HermOp(Ddwf);
+
+  // Eigenvector storage
+  LanczosParams fine  =Params.FineParams;  
+  LanczosParams coarse=Params.CoarseParams;  
+
+  const int Ns1 = fine.Nstop;   const int Ns2 = coarse.Nstop;
+  const int Nk1 = fine.Nk;      const int Nk2 = coarse.Nk;
+  const int Nm1 = fine.Nm;      const int Nm2 = coarse.Nm;
+
+  std::cout << GridLogMessage << "Keep " << fine.Nstop   << " fine   vectors" << std::endl;
+  std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl;
+  assert(Nm2 >= Nm1);
+  // nbasis is a compile-time template argument, so the run-time fine Nstop has to match it.
+  const int nbasis= 60;
+  assert(nbasis==Ns1);
+  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
+  std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
+  // The fine vectors must be either computed (doFine) or read back (doFineRead).
+  assert( (Params.doFine)||(Params.doFineRead));
+
+  if ( Params.doFine ) { 
+    std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "<<Nk1<<" Nm "<<Nm1<< std::endl;
+    _LocalCoherenceLanczos.calcFine(fine.Cheby,
+		 fine.Nstop,fine.Nk,fine.Nm,
+		 fine.resid,fine.MaxIt, 
+		 fine.betastp,fine.MinRes);
+
+    std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
+    _LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
+    _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
+    std::cout << GridLogIRL<<"Orthogonalising"<<std::endl;
+    _LocalCoherenceLanczos.Orthogonalise();
+    std::cout << GridLogIRL<<"Orthogonaled"<<std::endl;
+  }
+
+  if ( Params.doFineRead ) { 
+    _LocalCoherenceLanczos.checkpointFineRestore(std::string("evecs.scidac"),std::string("evals.xml"));
+    _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
+    _LocalCoherenceLanczos.Orthogonalise();
+  }
+
+  if ( Params.doCoarse ) {
+    std::cout << GridLogMessage << "Performing coarse grid IRL Nstop "<< Ns2<< " Nk "<<Nk2<<" Nm "<<Nm2<< std::endl;
+    _LocalCoherenceLanczos.calcCoarse(coarse.Cheby,Params.Smoother,Params.coarse_relax_tol,
+			      coarse.Nstop, coarse.Nk,coarse.Nm,
+			      coarse.resid, coarse.MaxIt, 
+			      coarse.betastp,coarse.MinRes);
+
+
+    std::cout << GridLogIRL<<"Checkpointing coarse evecs"<<std::endl;
+    _LocalCoherenceLanczos.checkpointCoarse(std::string("evecs.coarse.scidac"),std::string("evals.coarse.xml"));
+  }
+
+  if ( Params.doCoarseRead ) {
+    // Verify we can reread ???
+    _LocalCoherenceLanczos.checkpointCoarseRestore(std::string("evecs.coarse.scidac"),std::string("evals.coarse.xml"),coarse.Nstop);
+    _LocalCoherenceLanczos.testCoarse(coarse.resid*100.0,Params.Smoother,Params.coarse_relax_tol); // Coarse check
+  }
+  Grid_finalize();
+}
+
diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
index dd955054..244de3bc 100644
--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
@@ -56,12 +56,12 @@ public:
 
   void checkpointFine(std::string evecs_file,std::string evals_file)
   {
-    assert(this->_Aggregate.subspace.size()==nbasis);
+    assert(this->subspace.size()==nbasis);
     emptyUserRecord record;
-    Grid::ScidacWriter WR;
+    Grid::QCD::ScidacWriter WR(this->_FineGrid->IsBoss());
     WR.open(evecs_file);
     for(int k=0;k<nbasis;k++) {
-      WR.writeScidacFieldRecord(this->_Aggregate.subspace[k],record);
+      WR.writeScidacFieldRecord(this->subspace[k],record);
     }
     WR.close();
     
@@ -72,7 +72,7 @@ public:
   void checkpointFineRestore(std::string evecs_file,std::string evals_file)
   {
     this->evals_fine.resize(nbasis);
-    this->_Aggregate.subspace.resize(nbasis,this->_FineGrid);
+    this->subspace.resize(nbasis,this->_FineGrid);
     
     std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
     XmlReader RDx(evals_file);
@@ -85,9 +85,8 @@ public:
     Grid::ScidacReader RD ;
     RD.open(evecs_file);
     for(int k=0;k<nbasis;k++) {
-      this->_Aggregate.subspace[k].Checkerboard()=this->_checkerboard;
-      RD.readScidacFieldRecord(this->_Aggregate.subspace[k],record);
-      
+      this->subspace[k].checkerboard=this->_checkerboard;
+      RD.readScidacFieldRecord(this->subspace[k],record);
     }
     RD.close();
   }
@@ -96,7 +95,7 @@ public:
   {
     int n = this->evec_coarse.size();
     emptyUserRecord record;
-    Grid::ScidacWriter WR;
+    Grid::QCD::ScidacWriter WR(this->_CoarseGrid->IsBoss());
     WR.open(evecs_file);
     for(int k=0;k<n;k++) {
       WR.writeScidacFieldRecord(this->evec_coarse[k],record);
@@ -180,7 +179,6 @@ int main (int argc, char ** argv) {
   GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * CoarseGrid4rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
   GridCartesian         * CoarseGrid5    = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
-  GridRedBlackCartesian * CoarseGrid5rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid5);
 
   // Gauge field
   LatticeGaugeField Umu(UGrid);
@@ -206,7 +204,7 @@ int main (int argc, char ** argv) {
 
   const int nbasis= 60;
   assert(nbasis==Ns1);
-  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5rb,HermOp,Odd);
+  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
   std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
 
   assert( (Params.doFine)||(Params.doFineRead));
@@ -221,7 +219,9 @@ int main (int argc, char ** argv) {
     std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
     _LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
     _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
+    std::cout << GridLogIRL<<"Orthogonalising"<<std::endl;
     _LocalCoherenceLanczos.Orthogonalise();
+    std::cout << GridLogIRL<<"Orthogonaled"<<std::endl;
   }
 
   if ( Params.doFineRead ) { 
@@ -231,8 +231,6 @@ int main (int argc, char ** argv) {
   }
 
   if ( Params.doCoarse ) {
-    std::cout << GridLogMessage << "Orthogonalising " << nbasis<<" Nm "<<Nm2<< std::endl;
-    
     std::cout << GridLogMessage << "Performing coarse grid IRL Nstop "<< Ns2<< " Nk "<<Nk2<<" Nm "<<Nm2<< std::endl;
     _LocalCoherenceLanczos.calcCoarse(coarse.Cheby,Params.Smoother,Params.coarse_relax_tol,
 			      coarse.Nstop, coarse.Nk,coarse.Nm,
diff --git a/tests/qdpxx/Makefile.am b/tests/qdpxx/Makefile.am
index f7f30b85..bbcd0412 100644
--- a/tests/qdpxx/Makefile.am
+++ b/tests/qdpxx/Makefile.am
@@ -1,4 +1,5 @@
 AM_CXXFLAGS += `chroma-config --cxxflags`
-AM_LDFLAGS  += `chroma-config --ldflags` `chroma-config --libs`
+AM_LDFLAGS  += `chroma-config --ldflags`
+LIBS += `chroma-config --libs`
 
 include Make.inc
diff --git a/tests/qdpxx/Makefile.am.qdpxx b/tests/qdpxx/Makefile.am.qdpxx
index 3ccfa4b8..f212413f 100644
--- a/tests/qdpxx/Makefile.am.qdpxx
+++ b/tests/qdpxx/Makefile.am.qdpxx
@@ -1,6 +1,7 @@
 # additional include paths necessary to compile the C++ library
 
 AM_CXXFLAGS = -I$(top_srcdir)/include `chroma-config --cxxflags`
-AM_LDFLAGS = -L$(top_builddir)/lib `chroma-config --ldflags` `chroma-config --libs`
+AM_LDFLAGS = -L$(top_builddir)/lib `chroma-config --ldflags`
+LIBS += `chroma-config --libs`
 
 include Make.inc
diff --git a/tests/qdpxx/Test_qdpxx_wilson.cc b/tests/qdpxx/Test_qdpxx_wilson.cc
new file mode 100644
index 00000000..29e9c9ce
--- /dev/null
+++ b/tests/qdpxx/Test_qdpxx_wilson.cc
@@ -0,0 +1,519 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/qdpxx/Test_qdpxx_wilson.cc
+
+    Copyright (C) 2017
+
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/Grid.h>
+#include <chroma.h>
+#include <actions/ferm/invert/syssolver_linop_cg_array.h>
+#include <actions/ferm/invert/syssolver_linop_aggregate.h>
+
+// Mass
+double mq = 0.1;
+
+// Define Wilson Types
+typedef Grid::QCD::WilsonImplR::FermionField FermionField;
+typedef Grid::QCD::LatticeGaugeField GaugeField;
+
+enum ChromaAction
+{
+  Wilson,      // Wilson
+  WilsonClover // CloverFermions
+};
+
+namespace Chroma
+{
+// Static helpers that copy fields between Grid and Chroma/QDP and build the Chroma reference operator.
+class ChromaWrapper
+{
+public:
+  typedef multi1d<LatticeColorMatrix> U;
+  typedef LatticeFermion T4;
+  // "Import" means into Chroma: copy the Grid gauge field gr into the QDP link array ch, site by site.
+  static void ImportGauge(GaugeField &gr,
+                          QDP::multi1d<QDP::LatticeColorMatrix> &ch)
+  {
+    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::Complex cc;
+    QDP::ColorMatrix cm;
+    QDP::Complex c;
+
+    std::vector<int> x(4);
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd = gr._grid->GlobalDimensions();
+
+    for (x[0] = 0; x[0] < gd[0]; x[0]++)
+    {
+      for (x[1] = 0; x[1] < gd[1]; x[1]++)
+      {
+        for (x[2] = 0; x[2] < gd[2]; x[2]++)
+        {
+          for (x[3] = 0; x[3] < gd[3]; x[3]++)
+          {
+            cx[0] = x[0];
+            cx[1] = x[1];
+            cx[2] = x[2];
+            cx[3] = x[3];
+            Grid::peekSite(LCM, gr, x);
+
+            for (int mu = 0; mu < 4; mu++)
+            {
+              for (int i = 0; i < 3; i++)
+              {
+                for (int j = 0; j < 3; j++)
+                {
+                  cc = LCM(mu)()(i, j);
+                  c = QDP::cmplx(QDP::Real(real(cc)), QDP::Real(imag(cc)));
+                  QDP::pokeColor(cm, c, i, j);
+                }
+              }
+              QDP::pokeSite(ch[mu], cm, cx);
+            }
+          }
+        }
+      }
+    }
+  }
+  // "Export" means out of Chroma: copy the QDP link array ch into the Grid gauge field gr, site by site.
+  static void ExportGauge(GaugeField &gr,
+                          QDP::multi1d<QDP::LatticeColorMatrix> &ch)
+  {
+    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::Complex cc;
+    QDP::ColorMatrix cm;
+    QDP::Complex c;
+
+    std::vector<int> x(4);
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd = gr._grid->GlobalDimensions();
+
+    for (x[0] = 0; x[0] < gd[0]; x[0]++)
+    {
+      for (x[1] = 0; x[1] < gd[1]; x[1]++)
+      {
+        for (x[2] = 0; x[2] < gd[2]; x[2]++)
+        {
+          for (x[3] = 0; x[3] < gd[3]; x[3]++)
+          {
+            cx[0] = x[0];
+            cx[1] = x[1];
+            cx[2] = x[2];
+            cx[3] = x[3];
+
+            for (int mu = 0; mu < 4; mu++)
+            {
+              // One site fetch per link direction; it does not depend on i or j.
+              cm = QDP::peekSite(ch[mu], cx);
+              for (int i = 0; i < 3; i++)
+              {
+                for (int j = 0; j < 3; j++)
+                {
+                  c = QDP::peekColor(cm, i, j);
+                  cc = Grid::Complex(toDouble(real(c)), toDouble(imag(c)));
+                  LCM(mu)()(i, j) = cc;
+                }
+              }
+            }
+            Grid::pokeSite(LCM, gr, x);
+          }
+        }
+      }
+    }
+  }
+
+  // Copy the Grid fermion field gr into the QDP fermion ch (specific to 4d Wilson fermions).
+  static void ImportFermion(Grid::QCD::LatticeFermion &gr,
+                            QDP::LatticeFermion &ch)
+  {
+    Grid::QCD::SpinColourVector F;
+    Grid::Complex c;
+
+    QDP::Fermion cF;
+    QDP::SpinVector cS;
+    QDP::Complex cc;
+
+    std::vector<int> x(4); // explicit 4d fermions in Grid
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd = gr._grid->GlobalDimensions();
+
+    for (x[0] = 0; x[0] < gd[0]; x[0]++)
+    {
+      for (x[1] = 0; x[1] < gd[1]; x[1]++)
+      {
+        for (x[2] = 0; x[2] < gd[2]; x[2]++)
+        {
+          for (x[3] = 0; x[3] < gd[3]; x[3]++)
+          {
+            cx[0] = x[0];
+            cx[1] = x[1];
+            cx[2] = x[2];
+            cx[3] = x[3];
+
+            Grid::peekSite(F, gr, x);
+
+            for (int j = 0; j < 3; j++)
+            {
+              for (int sp = 0; sp < 4; sp++)
+              {
+
+                c = F()(sp)(j);
+
+                cc = QDP::cmplx(QDP::Real(real(c)), QDP::Real(imag(c)));
+
+                QDP::pokeSpin(cS, cc, sp);
+              }
+              QDP::pokeColor(cF, cS, j);
+            }
+            QDP::pokeSite(ch, cF, cx);
+          }
+        }
+      }
+    }
+  }
+
+  // Copy the QDP fermion ch into the Grid fermion field gr (specific to 4d Wilson fermions).
+  static void ExportFermion(Grid::QCD::LatticeFermion &gr,
+                            QDP::LatticeFermion &ch)
+  {
+    Grid::QCD::SpinColourVector F;
+    Grid::Complex c;
+
+    QDP::Fermion cF;
+    QDP::SpinVector cS;
+    QDP::Complex cc;
+
+    std::vector<int> x(4); // 4d fermions
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd = gr._grid->GlobalDimensions();
+
+    for (x[0] = 0; x[0] < gd[0]; x[0]++)
+    {
+      for (x[1] = 0; x[1] < gd[1]; x[1]++)
+      {
+        for (x[2] = 0; x[2] < gd[2]; x[2]++)
+        {
+          for (x[3] = 0; x[3] < gd[3]; x[3]++)
+          {
+            cx[0] = x[0];
+            cx[1] = x[1];
+            cx[2] = x[2];
+            cx[3] = x[3];
+
+            cF = QDP::peekSite(ch, cx);
+            // Peel off colour, then spin, and store at Grid's F()(spin)(colour).
+            for (int sp = 0; sp < 4; sp++)
+            {
+              for (int j = 0; j < 3; j++)
+              {
+                cS = QDP::peekColor(cF, j);
+                cc = QDP::peekSpin(cS, sp);
+                c = Grid::Complex(QDP::toDouble(QDP::real(cc)),
+                                  QDP::toDouble(QDP::imag(cc)));
+                F()(sp)(j) = c;
+              }
+            }
+            Grid::pokeSite(F, gr, x);
+          }
+        }
+      }
+    }
+  }
+  // Build Chroma's unpreconditioned operator for `params` on links u, using the global mass mq and a fixed anisotropy.
+  static Handle<Chroma::UnprecLinearOperator<T4, U, U>> GetLinOp(U &u, ChromaAction params)
+  {
+    QDP::Real _mq(mq);
+    QDP::multi1d<int> bcs(QDP::Nd);
+
+    // Boundary conditions
+    bcs[0] = bcs[1] = bcs[2] = bcs[3] = 1;
+
+    if (params == Wilson)
+    {
+
+      Chroma::WilsonFermActParams p;
+      p.Mass = _mq;
+      AnisoParam_t _apar;
+      _apar.anisoP = true;
+      _apar.t_dir = 3; // in 4d
+      _apar.xi_0 = 2.0;
+      _apar.nu = 1.0;
+      p.anisoParam = _apar;
+
+      Chroma::Handle<Chroma::FermBC<T4, U, U>> fbc(new Chroma::SimpleFermBC<T4, U, U>(bcs));
+      Chroma::Handle<Chroma::CreateFermState<T4, U, U>> cfs(new Chroma::CreateSimpleFermState<T4, U, U>(fbc));
+      Chroma::UnprecWilsonFermAct S_f(cfs, p);
+      Chroma::Handle<Chroma::FermState<T4, U, U>> ffs(S_f.createState(u));
+      return S_f.linOp(ffs);
+    }
+
+    // ChromaAction has only two values, so anything that is not Wilson must be
+    // WilsonClover. Handling it unconditionally makes every path return a value.
+    assert(params == WilsonClover);
+    Chroma::CloverFermActParams p;
+    p.Mass = _mq;
+    p.clovCoeffR = QDP::Real(1.0);
+    p.clovCoeffT = QDP::Real(2.0);
+    p.u0 = QDP::Real(1.0);
+    AnisoParam_t _apar;
+    _apar.anisoP = true;
+    _apar.t_dir = 3; // in 4d
+    _apar.xi_0 = 2.0;
+    _apar.nu = 1.0;
+    p.anisoParam = _apar;
+
+    Chroma::Handle<Chroma::FermBC<T4, U, U>> fbc(new Chroma::SimpleFermBC<T4, U, U>(bcs));
+    Chroma::Handle<Chroma::CreateFermState<T4, U, U>> cfs(new Chroma::CreateSimpleFermState<T4, U, U>(fbc));
+    Chroma::UnprecCloverFermAct S_f(cfs, p);
+    Chroma::Handle<Chroma::FermState<T4, U, U>> ffs(S_f.createState(u));
+    return S_f.linOp(ffs);
+  }
+};
+} // namespace Chroma
+
+void calc_chroma(ChromaAction action, GaugeField &lat, FermionField &src, FermionField &res, int dag)
+{
+  // Reference result from Chroma: apply the unpreconditioned operator selected by
+  // `action` (its dagger when dag is non-zero) to src on gauge field lat and store
+  // it in res. Fields are copied Grid -> QDP on entry and QDP -> Grid on exit.
+  QDP::multi1d<QDP::LatticeColorMatrix> u(4);
+  Chroma::ChromaWrapper::ImportGauge(lat, u);
+
+  QDP::LatticeFermion check; // operator output
+  QDP::LatticeFermion psi;   // QDP copy of src
+
+  Chroma::ChromaWrapper::ImportFermion(src, psi);
+
+  for (int mu = 0; mu < 4; mu++)
+  {
+    std::cout << "Imported Gauge norm [" << mu << "] " << QDP::norm2(u[mu]) << std::endl;
+  }
+  std::cout << "Imported Fermion norm " << QDP::norm2(psi) << std::endl;
+
+  auto linop = Chroma::ChromaWrapper::GetLinOp(u, action);
+
+  printf("Calling Chroma Linop\n");
+  fflush(stdout);
+
+  if (dag)
+    (*linop)(check, psi, Chroma::MINUS);
+  else
+    (*linop)(check, psi, Chroma::PLUS);
+
+  printf("Called Chroma Linop\n");
+  fflush(stdout);
+
+  // Disabled alternative, kept for reference: assemble the result from checkerboarded pieces.
+  // std::cout << "Calling Chroma Linop " << std::endl;
+  // linop->evenEvenLinOp(tmp, psi, isign);
+  // check[rb[0]] = tmp;
+  // linop->oddOddLinOp(tmp, psi, isign);
+  // check[rb[1]] = tmp;
+  // linop->evenOddLinOp(tmp, psi, isign);
+  // check[rb[0]] += tmp;
+  // linop->oddEvenLinOp(tmp, psi, isign);
+  // check[rb[1]] += tmp;
+
+  Chroma::ChromaWrapper::ExportFermion(res, check);
+}
+
+void make_gauge(GaugeField &Umu, FermionField &src)
+{
+  using namespace Grid;
+  using namespace Grid::QCD;
+  // Fill Umu via SU3::HotConfiguration and src with Gaussian noise, both drawn from one RNG seeded with {1, 2, 3, 4}.
+  std::vector<int> seeds4({1, 2, 3, 4});
+
+  Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu._grid;
+  Grid::GridParallelRNG RNG4(UGrid);
+  RNG4.SeedFixedIntegers(seeds4);
+  Grid::QCD::SU3::HotConfiguration(RNG4, Umu);
+
+  // Fermion field
+  Grid::gaussian(RNG4, src);
+  /* Disabled alternative source: on every site set spin 0 and spin 3 of colour 2 to one, all other components to zero.
+  Grid::QCD::SpinColourVector F;
+  Grid::Complex c;
+
+  
+
+  std::vector<int> x(4); // 4d fermions
+  std::vector<int> gd = src._grid->GlobalDimensions();
+
+  for (x[0] = 0; x[0] < gd[0]; x[0]++)
+  {
+    for (x[1] = 0; x[1] < gd[1]; x[1]++)
+    {
+      for (x[2] = 0; x[2] < gd[2]; x[2]++)
+      {
+        for (x[3] = 0; x[3] < gd[3]; x[3]++)
+        {
+          for (int sp = 0; sp < 4; sp++)
+          {
+            for (int j = 0; j < 3; j++) // colours
+            {
+              F()(sp)(j) = Grid::Complex(0.0,0.0);
+              if (((sp == 0)|| (sp==3)) && (j==2))
+              {
+                c = Grid::Complex(1.0, 0.0);
+                F()(sp)(j) = c;
+              }
+            }
+          }
+          Grid::pokeSite(F, src, x);
+          
+        }
+      }
+    }
+  }
+  */
+}
+
+void calc_grid(ChromaAction action, Grid::QCD::LatticeGaugeField &Umu, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res, int dag)
+{
+  // Grid half of the comparison: build the operator selected by `action` on
+  // Umu's grid and apply M (dag == 0) or Mdag (dag != 0) to src, leaving the
+  // product in res. Any action other than Wilson / WilsonClover reaches assert(0).
+  using namespace Grid;
+  using namespace Grid::QCD;
+  Grid::GridCartesian *fullGrid = (Grid::GridCartesian *)Umu._grid;
+  Grid::GridRedBlackCartesian *checkerGrid = Grid::QCD::SpaceTimeGrid::makeFourDimRedBlackGrid(fullGrid);
+  Grid::RealD quarkMass = mq; // mq is declared outside this function
+
+  if (action == Wilson)
+  {
+    WilsonImplParams implParams;
+    WilsonAnisotropyCoefficients aniso;
+    aniso.isAnisotropic = true;
+    aniso.t_direction = 3;
+    aniso.xi_0 = 2.0;
+    aniso.nu = 1.0;
+    Grid::QCD::WilsonFermionR wilsonOp(Umu, *fullGrid, *checkerGrid, quarkMass, implParams, aniso);
+
+    std::cout << Grid::GridLogMessage << " Calling Grid Wilson Fermion multiply " << std::endl;
+
+    if (!dag)
+      wilsonOp.M(src, res);
+    else
+      wilsonOp.Mdag(src, res);
+  }
+  else if (action == WilsonClover)
+  {
+    WilsonImplParams implParams;
+    WilsonAnisotropyCoefficients aniso;
+    aniso.isAnisotropic = true;
+    aniso.t_direction = 3;
+    aniso.xi_0 = 2.0;
+    aniso.nu = 1.0;
+    Grid::RealD cswR = 1.0;
+    Grid::RealD cswT = 2.0;
+    Grid::QCD::WilsonCloverFermionR cloverOp(Umu, *fullGrid, *checkerGrid, quarkMass, cswR, cswT, aniso, implParams);
+    cloverOp.ImportGauge(Umu);
+
+    std::cout << Grid::GridLogMessage << " Calling Grid Wilson Clover Fermion multiply " << std::endl;
+
+    if (!dag)
+      cloverOp.M(src, res);
+    else
+      cloverOp.Mdag(src, res);
+  }
+  else
+  {
+    assert(0); // no Grid operator is set up for any other action
+  }
+}
+
+int main(int argc, char **argv)
+{
+  // Cross-check Grid against Chroma: apply each action's M and Mdag through both codes and print the norms.
+  /********************************************************
+   * Setup QDP
+   *********************************************************/
+  Chroma::initialize(&argc, &argv);
+  Chroma::WilsonTypeFermActs4DEnv::registerAll();
+
+  /********************************************************
+   * Setup Grid
+   *********************************************************/
+  Grid::Grid_init(&argc, &argv);
+  Grid::GridCartesian *UGrid = Grid::QCD::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(),
+                                                                         Grid::GridDefaultSimd(Grid::QCD::Nd, Grid::vComplex::Nsimd()),
+                                                                         Grid::GridDefaultMpi());
+  // Hand QDP the same global lattice extents that the Grid lattice uses.
+  std::vector<int> gd = UGrid->GlobalDimensions();
+  QDP::multi1d<int> nrow(QDP::Nd);
+  for (int mu = 0; mu < 4; mu++)
+    nrow[mu] = gd[mu];
+
+  QDP::Layout::setLattSize(nrow);
+  QDP::Layout::create();
+
+  GaugeField Ug(UGrid);
+  FermionField src(UGrid);
+  FermionField res_chroma(UGrid);
+  FermionField res_grid(UGrid);
+  FermionField only_wilson(UGrid);
+  FermionField difference(UGrid);
+
+  std::vector<ChromaAction> ActionList({Wilson, WilsonClover});
+  std::vector<std::string> ActionName({"Wilson", "WilsonClover"});
+
+  {
+    for (int i = 0; i < ActionList.size(); i++)
+    {
+      std::cout << "*****************************" << std::endl;
+      std::cout << "Action " << ActionName[i] << std::endl;
+      std::cout << "*****************************" << std::endl;
+      make_gauge(Ug, src); // fills the gauge field and the fermion field with random numbers
+
+      for (int dag = 0; dag < 2; dag++)
+      {
+
+        {
+
+          std::cout << "Dag =  " << dag << std::endl;
+
+          calc_chroma(ActionList[i], Ug, src, res_chroma, dag);
+
+          // Remove the normalisation of Chroma Gauge links ????????
+          std::cout << "Norm of Chroma " << ActionName[i] << " multiply " << Grid::norm2(res_chroma) << std::endl;
+          calc_grid(ActionList[i], Ug, src, res_grid, dag);
+
+          std::cout << "Norm of gauge " << Grid::norm2(Ug) << std::endl;
+
+          std::cout << "Norm of Grid " << ActionName[i] << " multiply " << Grid::norm2(res_grid) << std::endl;
+
+          difference = res_chroma - res_grid;
+          std::cout << "Norm of difference " << Grid::norm2(difference) << std::endl;
+        }
+      }
+
+      std::cout << "Finished test " << std::endl;
+    }
+  }
+  // Shut Chroma down exactly once, after every action has run: finalizing inside
+  // the loop tore Chroma down before the second action's calc_chroma call.
+  Chroma::finalize();
+}
diff --git a/tests/solver/Test_dwf_mrhs_cg.cc b/tests/solver/Test_dwf_mrhs_cg.cc
index 223317d6..982a8247 100644
--- a/tests/solver/Test_dwf_mrhs_cg.cc
+++ b/tests/solver/Test_dwf_mrhs_cg.cc
@@ -114,7 +114,7 @@ int main (int argc, char ** argv)
 
   {
     FGrid->Barrier();
-    ScidacWriter _ScidacWriter;
+    ScidacWriter _ScidacWriter(FGrid->IsBoss());
     _ScidacWriter.open(file);
     std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
     std::cout << GridLogMessage << " Writing out gauge field "<<std::endl;
@@ -144,7 +144,7 @@ int main (int argc, char ** argv)
       std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
 
       std::stringstream filefn;      filefn << filef << "."<< n;
-      ScidacWriter _ScidacWriter;
+      ScidacWriter _ScidacWriter(FGrid->IsBoss());
       _ScidacWriter.open(filefn.str());
       _ScidacWriter.writeScidacFieldRecord(src[n],record);
       _ScidacWriter.close();
diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc
index d41d8595..556f4cbf 100644
--- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc
+++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc
@@ -38,6 +38,7 @@ int main (int argc, char ** argv)
   typedef typename DomainWallFermionR::ComplexField ComplexField; 
   typename DomainWallFermionR::ImplParams params; 
 
+  double stp=1.0e-5;
   const int Ls=4;
 
   Grid_init(&argc,&argv);
@@ -197,7 +198,7 @@ int main (int argc, char ** argv)
 
   MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
   MdagMLinearOperator<DomainWallFermionR,FermionField> HermOpCk(Dchk);
-  ConjugateGradient<FermionField> CG((1.0e-2),10000);
+  ConjugateGradient<FermionField> CG((stp),10000);
   s_res = Zero();
   CG(HermOp,s_src,s_res);
 
@@ -227,5 +228,11 @@ int main (int argc, char ** argv)
     std::cout << GridLogMessage<<" resid["<<n<<"]  "<< norm2(tmp)/norm2(src[n])<<std::endl;
   }
 
+  for(int s=0;s<nrhs;s++) result[s]=zero;
+  int blockDim = 0;//not used for BlockCGVec
+  BlockConjugateGradient<FermionField>    BCGV  (BlockCGVec,blockDim,stp,10000);
+  BCGV.PrintInterval=10;
+  BCGV(HermOpCk,src,result);
+
   Grid_finalize();
 }
diff --git a/tests/solver/Test_mobius_bcg.cc b/tests/solver/Test_mobius_bcg.cc
new file mode 100644
index 00000000..e59cb7e0
--- /dev/null
+++ b/tests/solver/Test_mobius_bcg.cc
@@ -0,0 +1,220 @@
+   /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/solver/Test_mobius_bcg.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  // Solves the Mobius MdagM system in two ways and prints norms/residuals:
+  //  1. ConjugateGradient on the split grids (SGrid/SFGrid), gathered with Grid_unsplit
+  //     and residual-checked against the operator built on the unsplit grids;
+  //  2. BlockConjugateGradient (BlockCGrQVec) over all nrhs sources on the unsplit grids.
+  typedef typename MobiusFermionR::FermionField FermionField; 
+  typedef typename MobiusFermionR::ComplexField ComplexField; 
+  typename MobiusFermionR::ImplParams params; 
+
+  const int Ls=12;
+
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  std::vector<int> mpi_split (mpi_layout.size(),1);
+  std::vector<int> split_coor (mpi_layout.size(),1);
+  std::vector<int> split_dim (mpi_layout.size(),1);
+  // Boundary phase is +1 in every direction except the last one (Nd-1), which is set to -1.
+  std::vector<ComplexD> boundary_phases(Nd,1.);
+  boundary_phases[Nd-1]=-1.;
+  params.boundary_phases = boundary_phases;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  /////////////////////////////////////////////
+  // Split into 1^4 mpi communicators
+  /////////////////////////////////////////////
+  // "--split a b c d" gives one split value per MPI dimension; dimensions with no value keep the default 1.
+  for(int i=0;i<argc;i++){
+    if(std::string(argv[i]) == "--split"){
+      for(int k=0;k<mpi_layout.size() && (i+1+k)<argc;k++){ // bounded by argc so a short list never reads past argv
+	std::stringstream ss; 
+	ss << argv[i+1+k]; 
+	ss >> mpi_split[k];
+      }
+      break;
+    }
+  }
+
+  double stp = 1.e-8;
+  int nrhs = 1;
+  int me; // presumably filled in by the split GridCartesian constructor below (passed as its last argument) -- confirm
+  for(int i=0;i<mpi_layout.size();i++){
+//	split_dim[i] = (mpi_layout[i]/mpi_split[i]);
+	nrhs *= (mpi_layout[i]/mpi_split[i]);
+//	split_coor[i] = FGrid._processor_coor[i]/mpi_split[i];
+  }
+  std::cout << GridLogMessage << "Creating split grids " <<std::endl;
+  GridCartesian         * SGrid = new GridCartesian(GridDefaultLatt(),
+						    GridDefaultSimd(Nd,vComplex::Nsimd()),
+						    mpi_split,
+						    *UGrid,me); 
+  std::cout << GridLogMessage <<"Creating split ferm grids " <<std::endl;
+
+  GridCartesian         * SFGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,SGrid);
+  std::cout << GridLogMessage <<"Creating split rb grids " <<std::endl;
+  GridRedBlackCartesian * SrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
+  std::cout << GridLogMessage <<"Creating split ferm rb grids " <<std::endl;
+  GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid);
+  std::cout << GridLogMessage << "Made the grids"<<std::endl;
+  ///////////////////////////////////////////////
+  // Set up the problem as a 4d spreadout job
+  ///////////////////////////////////////////////
+  std::vector<int> seeds({1,2,3,4});
+
+  std::vector<FermionField> src(nrhs,FGrid);
+  std::vector<FermionField> src_chk(nrhs,FGrid);
+  std::vector<FermionField> result(nrhs,FGrid);
+  FermionField tmp(FGrid);
+  std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
+
+  for(int s=0;s<nrhs;s++) result[s]=zero;
+  GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
+  for(int s=0;s<nrhs;s++) {
+    random(pRNG5,src[s]);
+    std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
+  }
+
+  std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
+
+  LatticeGaugeField Umu(UGrid); 
+
+  if(0) { 
+    FieldMetaData header;
+    std::string file("./lat.in");
+    NerscIO::readConfiguration(Umu,header,file);
+    std::cout << GridLogMessage << " "<<file<<" successfully read" <<std::endl;
+  } else {
+    GridParallelRNG pRNG(UGrid );  
+    std::cout << GridLogMessage << "Intialising 4D RNG "<<std::endl;
+    pRNG.SeedFixedIntegers(seeds);
+    std::cout << GridLogMessage << "Intialised 4D RNG "<<std::endl;
+    SU3::HotConfiguration(pRNG,Umu);
+    std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
+    std::cout << " Site zero "<< Umu._odata[0]   <<std::endl;
+  } 
+
+  /////////////////
+  // MPI only sends
+  /////////////////
+  LatticeGaugeField s_Umu(SGrid);
+  FermionField s_src(SFGrid);
+  FermionField s_tmp(SFGrid);
+  FermionField s_res(SFGrid);
+
+  std::cout << GridLogMessage << "Made the split grid fields"<<std::endl;
+  ///////////////////////////////////////////////////////////////
+  // split the source out using MPI instead of I/O
+  ///////////////////////////////////////////////////////////////
+  Grid_split  (Umu,s_Umu);
+  Grid_split  (src,s_src);
+  std::cout << GridLogMessage << " split rank  " <<me << " s_src "<<norm2(s_src)<<std::endl;
+
+  ///////////////////////////////////////////////////////////////
+  // Set up N-solvers as trivially parallel
+  ///////////////////////////////////////////////////////////////
+  std::cout << GridLogMessage << " Building the solvers"<<std::endl;
+  //  RealD mass=0.00107;
+  RealD mass=0.1;
+  RealD M5=1.8;
+  RealD mobius_factor=32./12.;
+  RealD mobius_b=0.5*(mobius_factor+1.);
+  RealD mobius_c=0.5*(mobius_factor-1.);
+  MobiusFermionR Dchk(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,mobius_b,mobius_c,params);
+  MobiusFermionR Ddwf(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,mobius_b,mobius_c,params);
+
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+  std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+
+  MdagMLinearOperator<MobiusFermionR,FermionField> HermOp(Ddwf);
+  MdagMLinearOperator<MobiusFermionR,FermionField> HermOpCk(Dchk);
+  ConjugateGradient<FermionField> CG((stp),100000);
+  s_res = zero;
+
+  CG(HermOp,s_src,s_res);
+
+  std::cout << GridLogMessage << " split residual norm "<<norm2(s_res)<<std::endl;
+  /////////////////////////////////////////////////////////////
+  // Report how long they all took
+  /////////////////////////////////////////////////////////////
+  std::vector<uint32_t> iterations(nrhs,0);
+  iterations[me] = CG.IterationsToComplete;
+    
+  for(int n=0;n<nrhs;n++){
+    UGrid->GlobalSum(iterations[n]);
+    std::cout << GridLogMessage<<" Rank "<<n<<" "<< iterations[n]<<" CG iterations"<<std::endl;
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Gather and residual check on the results
+  /////////////////////////////////////////////////////////////
+  std::cout << GridLogMessage<< "Unsplitting the result"<<std::endl;
+  Grid_unsplit(result,s_res);
+
+  std::cout << GridLogMessage<< "Checking the residuals"<<std::endl;
+  for(int n=0;n<nrhs;n++){
+    std::cout << GridLogMessage<< " res["<<n<<"] norm "<<norm2(result[n])<<std::endl;
+    HermOpCk.HermOp(result[n],tmp); tmp = tmp - src[n];
+    std::cout << GridLogMessage<<" resid["<<n<<"]  "<< std::sqrt(norm2(tmp)/norm2(src[n]))<<std::endl;
+  }
+
+  for(int s=0;s<nrhs;s++){
+    result[s]=zero;
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Try block CG
+  /////////////////////////////////////////////////////////////
+  int blockDim = 0;//not used for BlockCGVec
+  BlockConjugateGradient<FermionField>    BCGV  (BlockCGrQVec,blockDim,stp,100000);
+  {
+    BCGV(HermOpCk,src,result);
+  }
+
+  
+  Grid_finalize();
+}
diff --git a/tests/solver/Test_mobius_bcg_nosplit.cc b/tests/solver/Test_mobius_bcg_nosplit.cc
new file mode 100644
index 00000000..f3ed621f
--- /dev/null
+++ b/tests/solver/Test_mobius_bcg_nosplit.cc
@@ -0,0 +1,144 @@
+   /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/solver/Test_mobius_bcg_nosplit.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Grid.h>
+
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  // Domain-wall solve test: plain CG on the first source, then BlockConjugateGradient (BlockCGrQVec) on all nrhs sources.
+  typedef typename DomainWallFermionR::FermionField FermionField; 
+  typedef typename DomainWallFermionR::ComplexField ComplexField; 
+  typename DomainWallFermionR::ImplParams params; 
+  // Ls is the extent handed to the five-dimensional grid constructors below.
+  const int Ls=16;
+
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  // Boundary phase is +1 in every direction except the last one (Nd-1), which is set to -1.
+  std::vector<ComplexD> boundary_phases(Nd,1.);
+  boundary_phases[Nd-1]=-1.;
+  params.boundary_phases = boundary_phases;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  double stp = 1.e-8;
+  int nrhs = 2;
+
+  ///////////////////////////////////////////////
+  // Set up the problem as a 4d spreadout job
+  ///////////////////////////////////////////////
+  std::vector<int> seeds({1,2,3,4});
+
+  std::vector<FermionField> src(nrhs,FGrid);
+  std::vector<FermionField> src_chk(nrhs,FGrid);
+  std::vector<FermionField> result(nrhs,FGrid);
+  FermionField tmp(FGrid);
+  std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
+  // Zero the solutions, then fill every source from a fixed-seed RNG on the 5D grid.
+  for(int s=0;s<nrhs;s++) result[s]=zero;
+  GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
+  for(int s=0;s<nrhs;s++) {
+    random(pRNG5,src[s]);
+    std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
+  }
+
+  std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
+
+  LatticeGaugeField Umu(UGrid); 
+  // conf picks the gauge field: 0 reads ./lat.in, 1 draws a hot configuration, anything else uses a cold one.
+  int conf = 0;
+  if(conf==0) { 
+    FieldMetaData header;
+    std::string file("./lat.in");
+    NerscIO::readConfiguration(Umu,header,file);
+    std::cout << GridLogMessage << " Config "<<file<<" successfully read" <<std::endl;
+  } else if (conf==1){
+    GridParallelRNG pRNG(UGrid );  
+
+    pRNG.SeedFixedIntegers(seeds);
+    SU3::HotConfiguration(pRNG,Umu);
+    std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
+  } else {
+    SU3::ColdConfiguration(Umu);
+    std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl;
+  }
+
+  ///////////////////////////////////////////////////////////////
+  // Build the domain-wall operator and the solvers that act on it
+  ///////////////////////////////////////////////////////////////
+  std::cout << GridLogMessage << " Building the solvers"<<std::endl;
+  RealD mass=0.01;
+  RealD M5=1.8;
+  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,params);
+
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+  std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+
+  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
+  ConjugateGradient<FermionField> CG((stp),100000);
+  // Reference solve: plain CG on the first right-hand side only (both loops below are bounded by 1).
+  for(int rhs=0;rhs<1;rhs++){
+    result[rhs] = zero;
+    CG(HermOp,src[rhs],result[rhs]);
+  }
+
+  for(int rhs=0;rhs<1;rhs++){
+    std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Try block CG
+  /////////////////////////////////////////////////////////////
+  int blockDim = 0;//not used for BlockCGVec
+  for(int s=0;s<nrhs;s++){
+      result[s]=zero;
+   }
+  BlockConjugateGradient<FermionField>    BCGV  (BlockCGrQVec,blockDim,stp,100000);
+  {
+    BCGV(HermOp,src,result);
+  }
+  // Print the norm of every block-CG solution.
+  for(int rhs=0;rhs<nrhs;rhs++){
+    std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
+  }
+
+  Grid_finalize();
+}
diff --git a/tests/solver/Test_mobius_bcg_phys_nosplit.cc b/tests/solver/Test_mobius_bcg_phys_nosplit.cc
new file mode 100644
index 00000000..15617a05
--- /dev/null
+++ b/tests/solver/Test_mobius_bcg_phys_nosplit.cc
@@ -0,0 +1,148 @@
+   /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/solver/Test_mobius_bcg_phys_nosplit.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Grid.h>
+
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  // Block-CG test with physical sources: random 4D fields are imported into 5D by Ddwf.ImportPhysicalFermionSource and solved with BlockCGrQVec.
+  typedef typename DomainWallFermionR::FermionField FermionField; 
+  typedef typename DomainWallFermionR::ComplexField ComplexField; 
+  typename DomainWallFermionR::ImplParams params; 
+
+  const int Ls=16;
+
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  // Boundary phase is +1 in every direction except the last one (Nd-1), which is set to -1.
+  std::vector<ComplexD> boundary_phases(Nd,1.);
+  boundary_phases[Nd-1]=-1.;
+  params.boundary_phases = boundary_phases;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  double stp = 1.e-8;
+  int nrhs = 2;
+
+  ///////////////////////////////////////////////
+  // Set up the problem as a 4d spreadout job
+  ///////////////////////////////////////////////
+  std::vector<int> seeds({1,2,3,4});
+
+  std::vector<FermionField> src4(nrhs,UGrid);
+  std::vector<FermionField> src(nrhs,FGrid);
+  std::vector<FermionField> src_chk(nrhs,FGrid);
+  std::vector<FermionField> result(nrhs,FGrid);
+  FermionField tmp(FGrid);
+  std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
+  // Only the 4D sources are randomised here; src[] is not filled until ImportPhysicalFermionSource below, so report norm2(src4[s]).
+  for(int s=0;s<nrhs;s++) result[s]=zero;
+  GridParallelRNG pRNG4(UGrid);  pRNG4.SeedFixedIntegers(seeds);
+  for(int s=0;s<nrhs;s++) {
+    random(pRNG4,src4[s]);
+    std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src4[s])<<std::endl;
+  }
+
+  std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
+
+  LatticeGaugeField Umu(UGrid); 
+  // conf picks the gauge field: 0 reads ./lat.in, 1 draws a hot configuration, anything else uses a cold one.
+  int conf = 0;
+  if(conf==0) { 
+    FieldMetaData header;
+    std::string file("./lat.in");
+    NerscIO::readConfiguration(Umu,header,file);
+    std::cout << GridLogMessage << " Config "<<file<<" successfully read" <<std::endl;
+  } else if (conf==1){
+    GridParallelRNG pRNG(UGrid );  
+
+    pRNG.SeedFixedIntegers(seeds);
+    SU3::HotConfiguration(pRNG,Umu);
+    std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
+  } else {
+    SU3::ColdConfiguration(Umu);
+    std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl;
+  }
+
+  ///////////////////////////////////////////////////////////////
+  // Build the domain-wall operator and import the 4D sources into 5D
+  ///////////////////////////////////////////////////////////////
+  std::cout << GridLogMessage << " Building the solvers"<<std::endl;
+  RealD mass=0.01;
+  RealD M5=1.8;
+  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,params);
+  for(int s=0;s<nrhs;s++) {
+    Ddwf.ImportPhysicalFermionSource(src4[s],src[s]);
+  }
+
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+  std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+
+  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
+  ConjugateGradient<FermionField> CG((stp),100000);
+  // The single-RHS CG call is commented out, so the Result[0] norm printed next is that of the zeroed field.
+  for(int rhs=0;rhs<1;rhs++){
+    result[rhs] = zero;
+    //    CG(HermOp,src[rhs],result[rhs]);
+  }
+
+  for(int rhs=0;rhs<1;rhs++){
+    std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Try block CG
+  /////////////////////////////////////////////////////////////
+  int blockDim = 0;//not used for BlockCGVec
+  for(int s=0;s<nrhs;s++){
+    result[s]=zero;
+  }
+  BlockConjugateGradient<FermionField>    BCGV  (BlockCGrQVec,blockDim,stp,100000);
+  {
+    BCGV(HermOp,src,result);
+  }
+  // Print the norm of every block-CG solution.
+  for(int rhs=0;rhs<nrhs;rhs++){
+    std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
+  }
+
+  Grid_finalize();
+}
diff --git a/tests/solver/Test_mobius_bcg_prec_nosplit.cc b/tests/solver/Test_mobius_bcg_prec_nosplit.cc
new file mode 100644
index 00000000..63078613
--- /dev/null
+++ b/tests/solver/Test_mobius_bcg_prec_nosplit.cc
@@ -0,0 +1,147 @@
+   /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/solver/Test_mobius_bcg_prec_nosplit.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+
+#include <Grid/Grid.h>
+
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  // Domain-wall solve test: plain CG on the first source, then BlockCGrQVec wrapped in SchurRedBlackDiagTwoSolve on all nrhs sources.
+  typedef typename DomainWallFermionR::FermionField FermionField; 
+  typedef typename DomainWallFermionR::ComplexField ComplexField; 
+  typename DomainWallFermionR::ImplParams params; 
+  // Ls is the extent handed to the five-dimensional grid constructors below.
+  const int Ls=16;
+
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  // Boundary phase is +1 in every direction except the last one (Nd-1), which is set to -1.
+  std::vector<ComplexD> boundary_phases(Nd,1.);
+  boundary_phases[Nd-1]=-1.;
+  params.boundary_phases = boundary_phases;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * rbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  double stp = 1.e-8;
+  int nrhs = 4;
+
+  ///////////////////////////////////////////////
+  // Set up the problem as a 4d spreadout job
+  ///////////////////////////////////////////////
+  std::vector<int> seeds({1,2,3,4});
+
+  std::vector<FermionField> src(nrhs,FGrid);
+  std::vector<FermionField> src_chk(nrhs,FGrid);
+  std::vector<FermionField> result(nrhs,FGrid);
+  FermionField tmp(FGrid);
+  std::cout << GridLogMessage << "Made the Fermion Fields"<<std::endl;
+  // Zero the solutions, then fill every source from a fixed-seed RNG on the 5D grid.
+  for(int s=0;s<nrhs;s++) result[s]=zero;
+  GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
+  for(int s=0;s<nrhs;s++) {
+    random(pRNG5,src[s]);
+    std::cout << GridLogMessage << " src ["<<s<<"] "<<norm2(src[s])<<std::endl;
+  }
+
+  std::cout << GridLogMessage << "Intialised the Fermion Fields"<<std::endl;
+
+  LatticeGaugeField Umu(UGrid); 
+  // conf picks the gauge field: 0 reads ./lat.in, 1 draws a hot configuration, anything else (2 here) uses a cold one.
+  int conf = 2;
+  if(conf==0) { 
+    FieldMetaData header;
+    std::string file("./lat.in");
+    NerscIO::readConfiguration(Umu,header,file);
+    std::cout << GridLogMessage << " Config "<<file<<" successfully read" <<std::endl;
+  } else if (conf==1){
+    GridParallelRNG pRNG(UGrid );  
+
+    pRNG.SeedFixedIntegers(seeds);
+    SU3::HotConfiguration(pRNG,Umu);
+    std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<<std::endl;
+  } else {
+    SU3::ColdConfiguration(Umu);
+    std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<<std::endl;
+  }
+
+  ///////////////////////////////////////////////////////////////
+  // Build the domain-wall operator and the solvers that act on it
+  ///////////////////////////////////////////////////////////////
+  std::cout << GridLogMessage << " Building the solvers"<<std::endl;
+  RealD mass=0.01;
+  RealD M5=1.8;
+  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*rbGrid,mass,M5,params);
+
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+  std::cout << GridLogMessage << " Calling DWF CG "<<std::endl;
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+
+  MdagMLinearOperator<DomainWallFermionR,FermionField> HermOp(Ddwf);
+  ConjugateGradient<FermionField> CG((stp),100000);
+  // Reference solve: plain CG on the first right-hand side only (both loops below are bounded by 1).
+  for(int rhs=0;rhs<1;rhs++){
+    result[rhs] = zero;
+    CG(HermOp,src[rhs],result[rhs]);
+  }
+
+  for(int rhs=0;rhs<1;rhs++){
+    std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
+  }
+
+  /////////////////////////////////////////////////////////////
+  // Try block CG
+  /////////////////////////////////////////////////////////////
+  int blockDim = 0;//not used for BlockCGVec
+  for(int s=0;s<nrhs;s++){
+    result[s]=zero;
+  }
+
+  // Here the block CG is handed to SchurRedBlackDiagTwoSolve, which is called on Ddwf itself rather than on HermOp.
+  {
+    BlockConjugateGradient<FermionField>    BCGV  (BlockCGrQVec,blockDim,stp,100000);
+    SchurRedBlackDiagTwoSolve<FermionField> SchurSolver(BCGV);
+    SchurSolver(Ddwf,src,result);
+  }
+  // Print the norm of every block-CG solution.
+  for(int rhs=0;rhs<nrhs;rhs++){
+    std::cout << " Result["<<rhs<<"] norm = "<<norm2(result[rhs])<<std::endl;
+  }
+
+  Grid_finalize();
+}
diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc
index 8ca7c563..49e8f2d3 100644
--- a/tests/solver/Test_staggered_block_cg_prec.cc
+++ b/tests/solver/Test_staggered_block_cg_prec.cc
@@ -67,34 +67,70 @@ int main (int argc, char ** argv)
   GridParallelRNG pRNG(UGrid );  pRNG.SeedFixedIntegers(seeds);
   GridParallelRNG pRNG5(FGrid);  pRNG5.SeedFixedIntegers(seeds);
 
-  FermionField src(FGrid); random(pRNG5,src);
+  FermionField src(FGrid);
+  FermionField tt(FGrid);
+#if 1
+  random(pRNG5,src);
+#else
+  src=zero;
+  ComplexField coor(FGrid);
+  LatticeCoordinate(coor,0);
+  for(int ss=0;ss<FGrid->oSites();ss++){
+    src._odata[ss]()()(0)=coor._odata[ss]()()();
+  }
+  LatticeCoordinate(coor,1);
+  for(int ss=0;ss<FGrid->oSites();ss++){
+    src._odata[ss]()()(0)+=coor._odata[ss]()()();
+  }
+#endif
   FermionField src_o(FrbGrid);   pickCheckerboard(Odd,src_o,src);
   FermionField result_o(FrbGrid); result_o=Zero(); 
   RealD nrm = norm2(src);
 
   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
 
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+
   RealD mass=0.003;
-  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); 
+  RealD c1=9.0/8.0;
+  RealD c2=-1.0/24.0;
+  RealD u0=1.0;
+  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); 
   SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
 
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
   int blockDim = 0;
   BlockConjugateGradient<FermionField>    BCGrQ(BlockCGrQ,blockDim,1.0e-8,10000);
-  BlockConjugateGradient<FermionField>    BCG  (BlockCG,blockDim,1.0e-8,10000);
+  BlockConjugateGradient<FermionField>    BCG  (BlockCGrQ,blockDim,1.0e-8,10000);
+  BlockConjugateGradient<FermionField>    BCGv (BlockCGrQVec,blockDim,1.0e-8,10000);
   BlockConjugateGradient<FermionField>    mCG  (CGmultiRHS,blockDim,1.0e-8,10000);
 
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
   std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
-  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
+  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
   SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
   FermionField src4d(UGrid); random(pRNG,src4d);
   FermionField src4d_o(UrbGrid);   pickCheckerboard(Odd,src4d_o,src4d);
   FermionField result4d_o(UrbGrid); 
 
   result4d_o=Zero();
-  CG(HermOp4d,src4d_o,result4d_o);
+  double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + 90 == 1146, scaled by volume
+  { // time the 4d CG solve and print an estimated flop rate
+    double t1=usecond();
+    CG(HermOp4d,src4d_o,result4d_o);
+    double t2=usecond();
+    double ncall=CG.IterationsToComplete;
+    double flops = deodoe_flops * ncall; // per-iteration estimate times iterations taken
+    std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+    HermOp4d.Report();
+  }
+  Ds4d.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
 
@@ -103,7 +139,17 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   Ds.ZeroCounters();
   result_o=Zero();
-  CG(HermOp,src_o,result_o);
+  {
+    double t1=usecond();
+    CG(HermOp,src_o,result_o);
+    double t2=usecond();
+    double ncall=CG.IterationsToComplete*Ls;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+    HermOp.Report();
+  }
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
@@ -112,7 +158,37 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   Ds.ZeroCounters();
   result_o=Zero();
-  mCG(HermOp,src_o,result_o);
+  {
+    double t1=usecond();
+    mCG(HermOp,src_o,result_o);
+    double t2=usecond();
+    double ncall=mCG.IterationsToComplete*Ls;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+    HermOp.Report();
+  }
+
+  Ds.Report();
+  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
+
+  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
+  std::cout << GridLogMessage << " Calling Block CGrQ for "<<Ls <<" right hand sides" <<std::endl;
+  std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
+  Ds.ZeroCounters();
+  result_o=Zero();
+  {
+    double t1=usecond();
+    BCGrQ(HermOp,src_o,result_o);
+    double t2=usecond();
+    double ncall=BCGrQ.IterationsToComplete*Ls;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+    HermOp.Report();
+  }
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
@@ -120,11 +196,45 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   Ds.ZeroCounters();
-  result_o=Zero();
-  BCGrQ(HermOp,src_o,result_o);
+  result_o=zero;
+  { // time the BCG solve and print an estimated flop rate
+    double t1=usecond();
+    BCG(HermOp,src_o,result_o);
+    double t2=usecond();
+    double ncall=BCG.IterationsToComplete*Ls; // iterations of the solver just run (BCG), scaled by Ls
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+    HermOp.Report();
+  }
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+  std::cout << GridLogMessage << " Calling BCGvec "<<std::endl;
+  std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
+  std::vector<FermionField> src_v   (Ls,UrbGrid);
+  std::vector<FermionField> result_v(Ls,UrbGrid);
+  for(int s=0;s<Ls;s++) result_v[s] = zero;
+  for(int s=0;s<Ls;s++) {
+    FermionField src4(UGrid);
+    ExtractSlice(src4,src,s,0);
+    pickCheckerboard(Odd,src_v[s],src4);  
+  }
+
+  {
+    double t1=usecond();
+    BCGv(HermOp4d,src_v,result_v);
+    double t2=usecond();
+    double ncall=BCGv.IterationsToComplete*Ls;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+    //    HermOp4d.Report();
+  }
+
 
   Grid_finalize();
 }
diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc
index 798a6efc..b1797355 100644
--- a/tests/solver/Test_staggered_block_cg_unprec.cc
+++ b/tests/solver/Test_staggered_block_cg_unprec.cc
@@ -74,7 +74,16 @@ int main (int argc, char ** argv)
   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
 
   RealD mass=0.003;
-  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); 
+  RealD c1=9.0/8.0;
+  RealD c2=-1.0/24.0;
+  RealD u0=1.0;
+
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }
+
+  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0); 
   MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
 
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
@@ -86,11 +95,23 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
   std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
-  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
+  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
   FermionField src4d(UGrid); random(pRNG,src4d);
   FermionField result4d(UGrid); result4d=Zero();
-  CG(HermOp4d,src4d,result4d);
+
+  double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + 90 == 1146, scaled by volume
+  { // time the 4d CG solve and print an estimated flop rate
+    double t1=usecond();
+    CG(HermOp4d,src4d,result4d);
+    double t2=usecond();
+    double ncall=CG.IterationsToComplete;
+    double flops = deodoe_flops * ncall; // per-iteration estimate times iterations taken
+    std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+   }
+
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
 
@@ -98,9 +119,18 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=Zero();
+{
   Ds.ZeroCounters();
+  double t1=usecond();
   CG(HermOp,src,result);
+  double t2=usecond();
+  double ncall=CG.IterationsToComplete;
+  double flops = deodoe_flops * ncall;
+  std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
   Ds.Report();
+}
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -108,7 +138,16 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=Zero();
   Ds.ZeroCounters();
+{ // time the mCG solve and print an estimated flop rate
+  double t1=usecond();
   mCG(HermOp,src,result);
+  double t2=usecond();
+  double ncall=mCG.IterationsToComplete; // iterations of the solver just run (mCG), not the earlier CG
+  double flops = deodoe_flops * ncall; // NOTE(review): no Ls factor here, unlike the preconditioned test - confirm
+  std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+  std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+}
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
@@ -117,7 +156,16 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=Zero();
   Ds.ZeroCounters();
+{ // time the BCGrQ solve and print an estimated flop rate
+  double t1=usecond();
   BCGrQ(HermOp,src,result);
+  double t2=usecond();
+  double ncall=BCGrQ.IterationsToComplete; // iterations of the solver just run (BCGrQ), not the earlier CG
+  double flops = deodoe_flops * ncall; // NOTE(review): no Ls factor here, unlike the preconditioned test - confirm
+  std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+  std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+}
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
diff --git a/tests/solver/Test_staggered_cg_prec.cc b/tests/solver/Test_staggered_cg_prec.cc
index aa293457..94373301 100644
--- a/tests/solver/Test_staggered_cg_prec.cc
+++ b/tests/solver/Test_staggered_cg_prec.cc
@@ -71,7 +71,10 @@ int main (int argc, char ** argv)
   }  
   
   RealD mass=0.003;
-  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
+  RealD c1=9.0/8.0;
+  RealD c2=-1.0/24.0;
+  RealD u0=1.0;
+  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
 
   FermionField res_o(&RBGrid); 
   FermionField src_o(&RBGrid); 
@@ -80,7 +83,19 @@ int main (int argc, char ** argv)
 
   SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
+  double t1=usecond();
   CG(HermOpEO,src_o,res_o);
+  double t2=usecond();
+
+  // Schur solver: uses DeoDoe => volume * 1146
+  double ncall=CG.IterationsToComplete;
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + 90 == 1146, times volume and iterations
+
+  std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+  std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+
+
 
   FermionField tmp(&RBGrid);
 
diff --git a/tests/solver/Test_staggered_cg_schur.cc b/tests/solver/Test_staggered_cg_schur.cc
index 849eb897..81a2a0fa 100644
--- a/tests/solver/Test_staggered_cg_schur.cc
+++ b/tests/solver/Test_staggered_cg_schur.cc
@@ -65,7 +65,10 @@ int main (int argc, char ** argv)
   FermionField  resid(&Grid); 
 
   RealD mass=0.1;
-  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
+  RealD c1=9.0/8.0;
+  RealD c2=-1.0/24.0;
+  RealD u0=1.0;
+  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
 
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
   SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
diff --git a/tests/solver/Test_staggered_cg_unprec.cc b/tests/solver/Test_staggered_cg_unprec.cc
index 156e3418..9625a9c8 100644
--- a/tests/solver/Test_staggered_cg_unprec.cc
+++ b/tests/solver/Test_staggered_cg_unprec.cc
@@ -73,7 +73,10 @@ int main (int argc, char ** argv)
   }  
   
   RealD mass=0.1;
-  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
+  RealD c1=9.0/8.0;
+  RealD c2=-1.0/24.0;
+  RealD u0=1.0;
+  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
 
   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
   ConjugateGradient<FermionField> CG(1.0e-6,10000);
diff --git a/tests/solver/Test_staggered_multishift.cc b/tests/solver/Test_staggered_multishift.cc
new file mode 100644
index 00000000..04386027
--- /dev/null
+++ b/tests/solver/Test_staggered_multishift.cc
@@ -0,0 +1,121 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/solver/Test_staggered_multishift.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal { // wrapper holding a single value of type d; NOTE(review): not referenced by main() in this file
+  d internal;
+};
+// Gamma::Algebra entries GammaX, GammaY, GammaZ, GammaT; NOTE(review): not referenced by main() in this file
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  typedef typename ImprovedStaggeredFermionR::FermionField FermionField; 
+  typename ImprovedStaggeredFermionR::ImplParams params; // NOTE(review): never used below
+  // Driver: times one multishift CG solve on the odd checkerboard and prints a flop-rate estimate.
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(&Grid);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
+
+  // Gauge field filled by SU3::HotConfiguration using the fixed-seed pRNG
+  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
+  // volume = product of the Nd lattice extents held in latt_size
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+
+  ////////////////////////////////////////
+  // sqrt: AlgRemez approximation over [lo,hi]; the log message below labels it x^(1/2)
+  ////////////////////////////////////////
+  double     lo=0.001;
+  double     hi=1.0;
+  int precision=64;
+  int    degree=10;
+  AlgRemez remez(lo,hi,precision);
+  remez.generateApprox(degree,1,2);
+  MultiShiftFunction Sqrt(remez,1.0e-6,false);
+  std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
+  // NOTE(review): the "Generating" message above is printed only after generateApprox() has returned
+
+  ////////////////////////////////////////////
+  // Setup staggered
+  ////////////////////////////////////////////
+  RealD mass=0.003;
+  RealD c1=9.0/8.0;
+  RealD c2=-1.0/24.0;
+  RealD u0=1.0;
+  // mass, c1, c2, u0 go unchanged to the operator constructor (c1/c2/u0 presumably improvement/tadpole coefficients - confirm)
+  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
+  SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
+  // Random source on the full grid; only its odd-checkerboard part (src_o) is passed to the solver
+  FermionField src(&Grid); random(pRNG,src);
+  FermionField src_o(&RBGrid); 
+  pickCheckerboard(Odd,src_o,src);
+
+
+  /////////////////////////////////
+  //Multishift CG
+  /////////////////////////////////
+  std::vector<FermionField> result(degree,&RBGrid);
+  ConjugateGradientMultiShift<FermionField> MSCG(10000,Sqrt);
+  // result holds 'degree' fields on RBGrid; presumably one per shift of Sqrt - confirm
+  double deodoe_flops=(1205+15*degree)*volume; // (1205+15*degree) per unit volume; NOTE(review): derivation of 1205 and 15 not shown - confirm
+  // The "mflop/s" figure below is flops/(t2-t1); assumes usecond() returns microseconds - confirm
+  double t1=usecond();
+  MSCG(HermOpEO,src_o,result);
+  double t2=usecond();
+  double ncall=MSCG.IterationsToComplete;
+  double flops = deodoe_flops * ncall;
+  std::cout<<GridLogMessage << "usec    =   "<< (t2-t1)<<std::endl;
+  std::cout<<GridLogMessage << "flops   =   "<< flops<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t2-t1)<<std::endl;
+  //  HermOpEO.Report();
+
+  Grid_finalize();
+}